Commit 213a014
Merge branch 'master' into baichuan_support
tjruwase authored Dec 13, 2023
2 parents c102f4f + 3324efd
Showing 42 changed files with 666 additions and 129 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/nv-a6000.yml
@@ -3,9 +3,9 @@ name: nv-a6000
 on:
   pull_request:
     paths:
-      - "deepspeed/inference/v2/**"
-      - "tests/unit/inference/v2/**"
-      - ".github/workflows/nv-a6000.yml"
+      - 'deepspeed/inference/v2/**'
+      - 'tests/unit/inference/v2/**'
+      - '.github/workflows/nv-a6000.yml'
   workflow_dispatch:

 concurrency:
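For context: in YAML, single- and double-quoted scalars differ only in escape handling (double quotes interpret sequences such as \n, single quotes are literal), so for glob patterns like these the two forms are interchangeable. This hunk and the near-identical workflow hunks below are pure style normalization to single quotes.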
2 changes: 1 addition & 1 deletion .github/workflows/nv-accelerate-v100.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-inference.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-lightning-v100.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-megatron.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
4 changes: 2 additions & 2 deletions .github/workflows/nv-pre-compile-ops.yml
@@ -8,7 +8,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
@@ -19,7 +19,7 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  build-ops:
+  unit-tests:
     runs-on: ubuntu-20.04
     container:
       image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch-latest-cpu.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch-latest-v100.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-transformers-v100.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
4 changes: 2 additions & 2 deletions accelerator/cpu_accelerator.py
@@ -63,7 +63,7 @@ def random(self):
         return torch.random

     def set_rng_state(self, new_state, device_index=None):
-        if device_index == None:
+        if device_index is None:
             return torch.set_rng_state(new_state)
         return torch.set_rng_state(new_state, device_index)

@@ -253,7 +253,7 @@ def on_accelerator(self, tensor):
     # create an instance of op builder and return, name specified by class_name
     def create_op_builder(self, op_name):
         builder_class = self.get_op_builder(op_name)
-        if builder_class != None:
+        if builder_class is not None:
             return builder_class()
         return None
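For context on the `== None` to `is None` changes in this and the following accelerator files: PEP 8 prescribes `is None` because `==` dispatches to a class's `__eq__`, which may be overridden, while `is` tests identity against the `None` singleton and cannot be fooled. A minimal, self-contained illustration (not DeepSpeed code):

class AlwaysEqual:
    # A pathological __eq__ makes equality checks against None unreliable.
    def __eq__(self, other):
        return True

obj = AlwaysEqual()
print(obj == None)  # True  -- __eq__ lies about equality with None
print(obj is None)  # False -- identity against the None singleton is exact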
4 changes: 2 additions & 2 deletions accelerator/cuda_accelerator.py
@@ -44,7 +44,7 @@ def is_synchronized_device(self):

     # Device APIs
     def device_name(self, device_index=None):
-        if device_index == None:
+        if device_index is None:
             return 'cuda'
         return 'cuda:{}'.format(device_index)

@@ -280,7 +280,7 @@ def op_builder_dir(self):
     class_dict = None

     def _lazy_init_class_dict(self):
-        if self.class_dict != None:
+        if self.class_dict is not None:
             return
         else:
             self.class_dict = {}
4 changes: 2 additions & 2 deletions accelerator/mps_accelerator.py
@@ -26,7 +26,7 @@ def is_synchronized_device(self):

     # Device APIs
     def device_name(self, device_index=None):
-        if device_index == None:
+        if device_index is None:
             return "mps"
         return "mps:{}".format(device_index)

@@ -221,7 +221,7 @@ def op_builder_dir(self):
     # create an instance of op builder, specified by class_name
     def create_op_builder(self, op_name):
         builder_class = self.get_op_builder(op_name)
-        if builder_class != None:
+        if builder_class is not None:
             return builder_class()
         return None
2 changes: 1 addition & 1 deletion accelerator/npu_accelerator.py
@@ -30,7 +30,7 @@ def is_synchronized_device(self):

     # Device APIs
     def device_name(self, device_index=None):
-        if device_index == None:
+        if device_index is None:
             return 'npu'
         return 'npu:{}'.format(device_index)
2 changes: 1 addition & 1 deletion accelerator/real_accelerator.py
@@ -45,7 +45,7 @@ def _validate_accelerator(accel_obj):


 def is_current_accelerator_supported():
-    return get_accelerator() in SUPPORTED_ACCELERATOR_LIST
+    return get_accelerator().device_name() in SUPPORTED_ACCELERATOR_LIST


 def get_accelerator():
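For context: before this fix, `is_current_accelerator_supported()` compared the accelerator object itself against a list of device-name strings, so the membership test could never pass. A minimal sketch of the failure mode; the list contents here are illustrative, not the actual `SUPPORTED_ACCELERATOR_LIST`:

SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'npu', 'mps']  # illustrative values

class CUDA_Accelerator:
    def device_name(self, device_index=None):
        # Mirrors the device_name() pattern in the accelerator diffs above.
        return 'cuda' if device_index is None else 'cuda:{}'.format(device_index)

accel = CUDA_Accelerator()
print(accel in SUPPORTED_ACCELERATOR_LIST)                # False: object vs. strings
print(accel.device_name() in SUPPORTED_ACCELERATOR_LIST)  # True: string membership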
1 change: 1 addition & 0 deletions blogs/deepspeed-fastgen/README.md
@@ -228,6 +228,7 @@ We currently support the following model architectures in this alpha release of
 * [LLaMA](https://huggingface.co/models?other=llama) and [LLaMA-2](https://huggingface.co/models?other=llama-2)
 * [Mistral](https://huggingface.co/models?other=mistral)
 * [OPT](https://huggingface.co/models?other=opt)
+* [Falcon](https://huggingface.co/models?other=falcon)

 All current models leverage [HuggingFace](https://github.com/huggingface) APIs in our backend to provide both the model weights and the model's corresponding tokenizer.
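Since Falcon now sits alongside the other supported architectures, serving it should follow the same pipeline pattern the FastGen README shows for those models. A hedged sketch, assuming the `mii.pipeline` entry point from that README and using `tiiuae/falcon-7b` as one example checkpoint:

import mii

# Load a Falcon checkpoint through the FastGen pipeline entry point.
pipe = mii.pipeline("tiiuae/falcon-7b")
response = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128)
print(response)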
2 changes: 1 addition & 1 deletion deepspeed/inference/quantization/layers.py
@@ -86,7 +86,7 @@ def __init__(self, config: Dict, pre_quant_layer: nn.Embedding) -> None:
                                              device=pre_quant_layer.weight.device,
                                              dtype=pre_quant_layer.weight.dtype)

-        assert pre_quant_layer.max_norm == None, 'Not supported'
+        assert pre_quant_layer.max_norm is None, 'Not supported'
         assert pre_quant_layer.norm_type == 2, 'Not supported'
         assert pre_quant_layer.scale_grad_by_freq == False, 'Not supported'
         assert pre_quant_layer.sparse == False, 'Not supported'
3 changes: 3 additions & 0 deletions deepspeed/inference/v2/engine_factory.py
@@ -17,6 +17,7 @@
     OPTPolicy,
     Llama2Policy,
     MistralPolicy,
+    FalconPolicy,
 )
 from .model_implementations.inference_policy_base import POLICIES, InferenceV2Policy
 from .model_implementations.flat_model_helpers import make_metadata_filename, ModelMetadata
@@ -104,6 +105,8 @@ def build_hf_engine(path: str,
         assert version.parse(transformers.__version__) >= version.parse("4.34.0"), \
             f"Mistral requires transformers >= 4.34.0, you have version {transformers.__version__}"
         policy = MistralPolicy(model_config, checkpoint_engine=checkpoint_engine)
+    elif model_config.model_type == "falcon":
+        policy = FalconPolicy(model_config, checkpoint_engine=checkpoint_engine)
     else:
         raise ValueError(f"Unsupported model type {model_config.model_type}")
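The factory keys off the HuggingFace config's `model_type` string, so a Falcon checkpoint is now routed to `FalconPolicy` automatically. A simplified sketch of that dispatch; the absolute import path is our assumption based on the relative imports above, and the real `build_hf_engine` additionally handles checkpoint loading and metadata:

from deepspeed.inference.v2.model_implementations import FalconPolicy, MistralPolicy

def select_policy(model_config, checkpoint_engine):
    # Route on the HF config's model_type, as build_hf_engine does.
    if model_config.model_type == "mistral":
        return MistralPolicy(model_config, checkpoint_engine=checkpoint_engine)
    elif model_config.model_type == "falcon":
        return FalconPolicy(model_config, checkpoint_engine=checkpoint_engine)
    raise ValueError(f"Unsupported model type {model_config.model_type}")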
@@ -227,6 +227,16 @@ void launch_kv_rotary_kernel(T* kv_cache,
     DISPATCH_KV_ROTARY_IMPL(5, 128)
     DISPATCH_KV_ROTARY_IMPL(8, 64)
     DISPATCH_KV_ROTARY_IMPL(8, 128)
+    DISPATCH_KV_ROTARY_IMPL(16, 64)
+    DISPATCH_KV_ROTARY_IMPL(16, 128)
+    DISPATCH_KV_ROTARY_IMPL(29, 64)
+    DISPATCH_KV_ROTARY_IMPL(29, 128)
+    DISPATCH_KV_ROTARY_IMPL(35, 64)
+    DISPATCH_KV_ROTARY_IMPL(35, 128)
+    DISPATCH_KV_ROTARY_IMPL(36, 64)
+    DISPATCH_KV_ROTARY_IMPL(36, 128)
+    DISPATCH_KV_ROTARY_IMPL(71, 64)
+    DISPATCH_KV_ROTARY_IMPL(71, 128)
 }

 #define INSTANTIATE_KV_ROTARY_KERNEL(TYPE) \
@@ -19,7 +19,7 @@ class BlockedRotaryEmbeddings(DSKernelBase):

     supported_dtypes = [DtypeEnum.fp16, DtypeEnum.bf16]
     supported_head_sizes = [64, 128]
-    supported_q_ratios = [1, 2, 4, 5, 8]
+    supported_q_ratios = [1, 2, 4, 5, 8, 16, 29, 35, 36, 71]

     def __init__(self, head_size: int, n_q_heads: int, n_kv_heads: int, dtype: torch.dtype) -> None:
         """
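For context (our reading of the diff, not stated in it): a q_ratio is the grouped-query factor n_q_heads // n_kv_heads, i.e. how many query heads share one KV head, and the new values line up with Falcon configurations: Falcon-7B's multi-query attention uses 71 query heads with a single KV head (ratio 71), and Falcon-40B uses 128 query heads over 8 KV heads (ratio 16). A minimal sketch of the check these lists and the CUDA dispatch cases above imply:

SUPPORTED_Q_RATIOS = [1, 2, 4, 5, 8, 16, 29, 35, 36, 71]

def q_ratio(n_q_heads: int, n_kv_heads: int) -> int:
    # Each KV head serves a fixed-size group of query heads.
    assert n_q_heads % n_kv_heads == 0, "query heads must split evenly over KV heads"
    ratio = n_q_heads // n_kv_heads
    assert ratio in SUPPORTED_Q_RATIOS, f"unsupported q_ratio {ratio}"
    return ratio

print(q_ratio(71, 1))   # Falcon-7B style multi-query attention -> 71
print(q_ratio(128, 8))  # Falcon-40B style grouped-query attention -> 16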
1 change: 1 addition & 0 deletions deepspeed/inference/v2/model_implementations/__init__.py
@@ -12,3 +12,4 @@
 from .llama_v2 import *
 from .opt import *
 from .mistral import *
+from .falcon import *
@@ -0,0 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from .falcon_policy import FalconPolicy
@@ -0,0 +1,129 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# Create a container object to save model-specific tensors using the policy file above.
+
+from ...model_implementations.common_parameters import *
+from ...model_implementations.layer_container_base import LayerContainer
+'''
+# HF Falcon 7b model looks like this:
+FalconForCausalLM(
+  (transformer): FalconModel(
+    (word_embeddings): Embedding(65024, 4544)
+    (h): ModuleList(
+      (0-31): 32 x FalconDecoderLayer(
+        (self_attention): FalconAttention(
+          (maybe_rotary): FalconRotaryEmbedding()
+          (query_key_value): FalconLinear(in_features=4544, out_features=4672, bias=False)
+          (dense): FalconLinear(in_features=4544, out_features=4544, bias=False)
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (mlp): FalconMLP(
+          (dense_h_to_4h): FalconLinear(in_features=4544, out_features=18176, bias=False)
+          (act): GELU(approximate='none')
+          (dense_4h_to_h): FalconLinear(in_features=18176, out_features=4544, bias=False)
+        )
+        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
+      )
+    )
+    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
+  )
+  (lm_head): Linear(in_features=4544, out_features=65024, bias=False)
+)
+'''
+
+
+class FalconTransformerContainer(LayerContainer):
+    """
+    Transformer layer container for the Falcon model.
+    """
+    qkv_w: FusedQKVParameter
+    attn_out_w: AttentionOutputParameter
+    mlp_1_w: MLP1Parameter
+    mlp_2_w: MLP2Parameter
+    ln_attn_gamma: NormParameter
+    ln_attn_beta: NormParameter
+
+    PARAM_MAPPING = {
+        "self_attention.query_key_value.weight": "qkv_w.params",
+        "self_attention.dense.weight": "attn_out_w.params",
+        "mlp.dense_h_to_4h.weight": "mlp_1_w.params",
+        "mlp.dense_4h_to_h.weight": "mlp_2_w.params",
+        "input_layernorm.weight": "ln_attn_gamma.params",
+        "input_layernorm.bias": "ln_attn_beta.params",
+    }
+
+
+class FalconNonTransformerContainer(LayerContainer):
+    """
+    Non-Transformer layer container for the Falcon model.
+    """
+    word_emb: EmbeddingParameter
+    word_unembed: UnembedParameter
+    final_norm_gamma: NormParameter
+    final_norm_beta: NormParameter
+
+    PARAM_MAPPING = {
+        "transformer.word_embeddings.weight": "word_emb.params",
+        "transformer.ln_f.weight": "final_norm_gamma.params",
+        "transformer.ln_f.bias": "final_norm_beta.params",
+        "lm_head.weight": "word_unembed.params",
+    }
+
+
+'''
+# HF Falcon 40b model looks like this:
+FalconForCausalLM(
+  (transformer): FalconModel(
+    (word_embeddings): Embedding(65024, 8192)
+    (h): ModuleList(
+      (0-59): 60 x FalconDecoderLayer(
+        (self_attention): FalconAttention(
+          (maybe_rotary): FalconRotaryEmbedding()
+          (query_key_value): FalconLinear(in_features=8192, out_features=9216, bias=False)
+          (dense): FalconLinear(in_features=8192, out_features=8192, bias=False)
+          (attention_dropout): Dropout(p=0.0, inplace=False)
+        )
+        (mlp): FalconMLP(
+          (dense_h_to_4h): FalconLinear(in_features=8192, out_features=32768, bias=False)
+          (act): GELU(approximate='none')
+          (dense_4h_to_h): FalconLinear(in_features=32768, out_features=8192, bias=False)
+        )
+        (ln_attn): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)
+        (ln_mlp): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)
+      )
+    )
+    (ln_f): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)
+  )
+  (lm_head): Linear(in_features=8192, out_features=65024, bias=False)
+)
+'''
+
+
+class FalconNewArchTransformerContainer(LayerContainer):
+    """
+    Transformer layer container for the Falcon model.
+    """
+    qkv_w: GQAMegatronQKVParameter
+    attn_out_w: AttentionOutputParameter
+    mlp_1_w: MLP1Parameter
+    mlp_2_w: MLP2Parameter
+    ln_attn_gamma: NormParameter
+    ln_attn_beta: NormParameter
+    ln_mlp_gamma: NormParameter
+    ln_mlp_beta: NormParameter
+
+    PARAM_MAPPING = {
+        "self_attention.query_key_value.weight": "qkv_w.params",
+        "self_attention.dense.weight": "attn_out_w.params",
+        "mlp.dense_h_to_4h.weight": "mlp_1_w.params",
+        "mlp.dense_4h_to_h.weight": "mlp_2_w.params",
+        "ln_attn.weight": "ln_attn_gamma.params",
+        "ln_attn.bias": "ln_attn_beta.params",
+        "ln_mlp.weight": "ln_mlp_gamma.params",
+        "ln_mlp.bias": "ln_mlp_beta.params",
+    }
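For context: each PARAM_MAPPING entry routes a checkpoint tensor name (left) onto a parameter slot of the container (right). A toy illustration of that routing, not the actual LayerContainer machinery:

# Toy routing table: checkpoint tensor names -> container attribute paths.
PARAM_MAPPING = {
    "self_attention.query_key_value.weight": "qkv_w.params",
    "mlp.dense_h_to_4h.weight": "mlp_1_w.params",
}

checkpoint = {name: f"<tensor for {name}>" for name in PARAM_MAPPING}

for src_name, tensor in checkpoint.items():
    field, attr = PARAM_MAPPING[src_name].split(".")
    print(f"route {src_name} -> container.{field}.{attr}")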