Implement parallel model preloading #211

Draft · wants to merge 1 commit into main
6 changes: 6 additions & 0 deletions exo/inference/inference_engine.py
@@ -7,6 +7,7 @@


class InferenceEngine(ABC):

  @abstractmethod
  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
    pass
@@ -15,6 +16,11 @@ async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_s
  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
    pass

  @abstractmethod
  async def preload_model(self, shard: Shard) -> None:
    """Preload the model into memory without full initialization."""
    pass


def get_inference_engine(inference_engine_name: str, shard_downloader: 'ShardDownloader'):
  if inference_engine_name == "mlx":
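Note on the interface change: because preload_model is declared with @abstractmethod, every existing InferenceEngine subclass must now implement it or it can no longer be instantiated. A minimal sketch of a no-op implementation for an engine that has nothing to warm up (the class name is hypothetical and not part of this diff, and the other method bodies are elided):

class EagerlyLoadedEngine(InferenceEngine):
  async def infer_prompt(self, request_id, shard, prompt, image_str=None, inference_state=None):
    ...

  async def infer_tensor(self, request_id, shard, input_data, inference_state=None):
    ...

  async def preload_model(self, shard: Shard) -> None:
    # Nothing to preload for this engine; implementing the abstract method
    # keeps the class instantiable after this change.
    return None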
21 changes: 21 additions & 0 deletions exo/inference/mlx/sharded_inference_engine.py
@@ -46,3 +46,24 @@ def load_shard_wrapper(): return asyncio.run(load_shard(model_path, shard))
    model_shard, self.tokenizer = await loop.run_in_executor(self.executor, load_shard_wrapper)
    self.stateful_sharded_model = await loop.run_in_executor(self.executor, StatefulShardedModel, shard, model_shard)
    self.shard = shard

  async def preload_model(self, shard: Shard) -> None:
    # Implement MLX-specific preloading logic
    # This might involve loading weights into memory
    # without fully initializing the model
    if self.model is None:
      # Load the model configuration
      config = await self.load_config(shard)

      # Load the model weights into memory
      # but don't initialize the full model yet
      self.weights = await self.load_weights(config, shard)

  async def load_weights(self, config, shard):
    # Implement weight loading logic here
    # This should load the weights into memory without full model initialization
    pass

  def initialize_model(self, weights):
    # Implement full model initialization using preloaded weights
    pass
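The load_weights and initialize_model bodies are still stubs in this draft. Below is one way the weight-loading half could be filled in, as a sketch only: it assumes the shard's files are stored as .safetensors under the path returned by self.shard_downloader.ensure_shard(shard) (the same downloader call ensure_shard already relies on), reads them with mx.load, which returns plain tensor dicts without constructing the model graph, and leaves config unused.

import asyncio
import glob
from pathlib import Path

import mlx.core as mx


async def load_weights(self, config, shard):
  # Make sure the shard's files are on disk, then read the raw tensors.
  model_path = await self.shard_downloader.ensure_shard(shard)
  weight_files = sorted(glob.glob(str(Path(model_path) / "*.safetensors")))

  def _read_all():
    weights = {}
    for wf in weight_files:
      # mx.load on a .safetensors file returns a dict of arrays;
      # no model is built, so this is preloading only.
      weights.update(mx.load(wf))
    return weights

  # Reading many large files is blocking work; keep it off the event loop.
  loop = asyncio.get_running_loop()
  return await loop.run_in_executor(self.executor, _read_all)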
5 changes: 5 additions & 0 deletions exo/orchestration/standard_node.py
@@ -432,3 +432,8 @@ async def send_status_to_peer(peer):
  @property
  def current_topology(self) -> Topology:
    return self.topology

  def get_assigned_shards(self):
    # For a standard node, all shards are assigned to it.
    # Assumes self.shards exists; otherwise this falls back to an empty list.
    return self.shards if hasattr(self, 'shards') else []
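StandardNode does not currently define a shards attribute, so as written get_assigned_shards returns [] and the preloading pass in main.py does nothing. One hypothetical way to give it real data, recording each shard the node resolves for itself, is sketched below; the mixin, the _record_shard helper, and where it would be called are assumptions, not existing exo code.

from typing import List

from exo.inference.shard import Shard


class ShardTrackingMixin:
  """Hypothetical helper: remember which shards this node has been asked to serve."""

  def __init__(self):
    self.shards: List[Shard] = []

  def _record_shard(self, shard: Shard) -> None:
    # Call this wherever the node resolves the shard it will run,
    # so preloading sees the same set of shards.
    if shard not in self.shards:
      self.shards.append(shard)

  def get_assigned_shards(self) -> List[Shard]:
    return list(self.shards)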
8 changes: 8 additions & 0 deletions main.py
@@ -172,6 +172,14 @@ def handle_exit():

  await node.start(wait_for_peers=args.wait_for_peers)

  # Parallelize model preloading
  shards_to_load = node.get_assigned_shards()
  await asyncio.gather(*(inference_engine.preload_model(shard) for shard in shards_to_load))

  # Finish initialization sequentially if needed
  for shard in shards_to_load:
    await inference_engine.ensure_shard(shard)

  if args.run_model:
    await run_model_cli(node, inference_engine, args.run_model, args.prompt)
  else:
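A note on the asyncio.gather call above: it starts every preload at once, so a node assigned many large shards will try to page them all in simultaneously. If that turns out to be too aggressive, a bounded variant is easy to sketch; the helper name and the limit of 2 below are arbitrary assumptions, not part of this PR.

import asyncio


async def preload_with_limit(inference_engine, shards, limit: int = 2):
  # Cap how many models are being preloaded concurrently.
  sem = asyncio.Semaphore(limit)

  async def _preload_one(shard):
    async with sem:
      await inference_engine.preload_model(shard)

  await asyncio.gather(*(_preload_one(shard) for shard in shards))

Calling await preload_with_limit(inference_engine, shards_to_load) would slot into main.py in place of the unbounded gather.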