meta-llama · dvrogozh · Dec 3, 2024 · Dec 6, 2024
@@ -74,6 +74,7 @@ def build(
         model_parallel_size: Optional[int] = None,
         tokenizer_path: Optional[str] = None,
         seed: int = 1,
+        device: str = "cuda"
     ):
         """
         Build a Llama instance by initializing and loading a model checkpoint.
@@ -85,30 +86,43 @@ def build(
             max_batch_size (int): Maximum batch size for inference.
             model_parallel_size (Optional[int], optional): Number of model parallel processes.
                 If not provided, it's determined from the environment. Defaults to None.
+            device (str, optional): Device to use, e.g. cuda (default), xpu, cpu, etc.
 
         Returns:
             Llama: An instance of the Llama class with the loaded model and tokenizer.
 
         Raises:
             AssertionError: If there are no checkpoint files in the specified directory,
                 or if the model parallel size does not match the number of checkpoint files.
+            RuntimeError: If PyTorch backend for the specified device is not available.
 
 
         Note:
             This method initializes the distributed process group, sets the device to CUDA,
             and loads the pre-trained model and tokenizer.
         """
 
+        device = torch.device(device)
+        if (device.type == "cuda" and not torch.cuda.is_available() or
+            device.type == "xpu" and not torch.xpu.is_available()):
+            raise RuntimeError(f"PyTorch backend for {device.type} device type is not available")
+
         if not torch.distributed.is_initialized():
-            torch.distributed.init_process_group("nccl")
+            if device.type == "cuda":
+                torch.distributed.init_process_group("nccl")
+            else:
+                torch.distributed.init_process_group("gloo")
 
         if not model_parallel_is_initialized():
             if model_parallel_size is None:
                 model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
             initialize_model_parallel(model_parallel_size)
 
         local_rank = int(os.environ.get("LOCAL_RANK", 0))
-        torch.cuda.set_device(local_rank)
+        if device.type == "cuda":
+            torch.cuda.set_device(local_rank)
+        elif device.type == "xpu":
+            torch.xpu.set_device(local_rank)
 
         torch.manual_seed(seed)
 
@@ -138,18 +152,29 @@ def build(
             tokenizer = Tokenizer.get_instance()
 
         assert model_args.vocab_size == tokenizer.n_words
-        if torch.cuda.is_bf16_supported():
-            torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
+        torch.set_default_device(device)
+        if device.type == "cuda":
+            if torch.cuda.is_bf16_supported():
+                torch.set_default_dtype(torch.bfloat16)
+            else:
+                torch.set_default_dtype(torch.half)
+        elif device.type == "xpu":
+            if torch.xpu.is_bf16_supported():
+                torch.set_default_dtype(torch.bfloat16)
+            else:
+                torch.set_default_dtype(torch.half)
         else:
-            torch.set_default_tensor_type(torch.cuda.HalfTensor)
+            torch.set_default_dtype(torch.half)
+
         if model_args.vision_chunk_size > 0:
             from .multimodal.model import CrossAttentionTransformer
 
             model = CrossAttentionTransformer(model_args)
-            model.setup_cache(model_args.max_batch_size, torch.bfloat16)
+            model.setup_cache(model_args.max_batch_size, torch.get_default_dtype())
         else:
             model = Transformer(model_args)
         model.load_state_dict(checkpoint, strict=True)
+        model.to(device)
         print(f"Loaded in {time.time() - start_time:.2f} seconds")
 
         return Llama(model, tokenizer, model_args)
@@ -213,14 +238,14 @@ def generate(
             )
 
         pad_id = self.tokenizer.pad_id
-        tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
+        tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long)
         for k, t in enumerate(prompt_tokens):
-            tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")
+            tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long)
         if logprobs:
             token_logprobs = torch.zeros_like(tokens, dtype=torch.float)
 
         prev_pos = 0
-        eos_reached = torch.tensor([False] * bsz, device="cuda")
+        eos_reached = torch.tensor([False] * bsz)
         input_text_mask = tokens != pad_id
 
         if echo:
@@ -237,7 +262,7 @@ def generate(
         for cur_pos in range(min_prompt_len, total_len):
             if is_vision:
                 position_ids = torch.arange(
-                    prev_pos, cur_pos, dtype=torch.long, device="cuda"
+                    prev_pos, cur_pos, dtype=torch.long
                 )
                 text_only_inference = model_input.vision is None
                 logits = self.model.forward(

@@ -158,15 +158,15 @@ def __init__(self, args: ModelArgs):
                 self.n_local_kv_heads,
                 self.head_dim,
             )
-        ).cuda()
+        )
         self.cache_v = torch.zeros(
             (
                 args.max_batch_size,
                 args.max_seq_len,
                 self.n_local_kv_heads,
                 self.head_dim,
             )
-        ).cuda()
+        )
 
     def forward(
         self,

@@ -1113,7 +1113,7 @@ def forward(
         # aspect_ratios: (B, T)
         # h: (B, T, D)
         vision_tokens = self.vision_encoder(
-            images.to(dtype=torch.bfloat16), aspect_ratios
+            images.to(dtype=torch.get_default_dtype()), aspect_ratios
         )
 
         vision_tokens = F.linear(
@@ -1407,8 +1407,6 @@ def compute_vision_tokens_masks(
         else:
             vision_tokens = self.vision_model(stacked_images, aspect_ratios)
 
-        vision_tokens = vision_tokens.to("cuda")
-
         bsz, nimg, nchunk, ntok, image_token_dim = tuple(vision_tokens.shape)
         xattn_caches = torch.stack(
             [
@@ -1428,7 +1426,7 @@ def compute_vision_tokens_masks(
         cross_attention_masks, full_text_row_masked_out_mask = (
             self.text_model._get_xattn_mask(
                 num_tokens=total_len,
-                text_device="cuda",
+                text_device=vision_tokens.device.type,
                 text_dtype=next(self.text_model.parameters()).dtype,
                 vision_tokens=vision_tokens,
                 cross_attention_masks=padded_masks,
@@ -1495,7 +1493,7 @@ def _pad_masks(
     total_len: int,
     max_num_chunks: int,
 ) -> torch.Tensor:
-    dtype = torch.bfloat16
+    dtype = torch.get_default_dtype()
     inf_value = get_negative_inf_value(dtype)
 
     bsz = len(all_masks)

@@ -33,6 +33,7 @@ def build_generator(env_var: str):
         max_seq_len=128,
         max_batch_size=1,
         model_parallel_size=1,
+        device=os.getenv("DEVICE", "cuda")
     )