mudler
diff --git a/‎backend/python/mlx-vlm/Makefile‎
Lines changed: 29 additions & 0 deletions b/‎backend/python/mlx-vlm/Makefile‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎backend/python/mlx-vlm/backend.py‎
Lines changed: 367 additions & 0 deletions b/‎backend/python/mlx-vlm/backend.py‎
Lines changed: 367 additions & 0 deletions
@@ -0,0 +1,29 @@
+.PHONY: mlx
+mlx: protogen
+	bash install.sh
+
+.PHONY: run
+run: protogen
+	@echo "Running mlx..."
+	bash run.sh
+	@echo "mlx run."
+
+.PHONY: test
+test: protogen
+	@echo "Testing mlx..."
+	bash test.sh
+	@echo "mlx tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
+
+.PHONY: clean
+clean: protogen-clean
+	rm -rf venv __pycache__
@@ -0,0 +1,367 @@
+#!/usr/bin/env python3
+import asyncio
+from concurrent import futures
+import argparse
+import signal
+import sys
+import os
+from typing import List
+from PIL import Image
+
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.sampling_params import SamplingParams
+from vllm.utils import random_uuid
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.multimodal.utils import fetch_image
+from vllm.assets.video import VideoAsset
+import base64
+import io
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    A gRPC servicer that implements the Backend service defined in backend.proto.
+    """
+    def generate(self,prompt, max_new_tokens):
+        """
+        Generates text based on the given prompt and maximum number of new tokens.
+
+        Args:
+            prompt (str): The prompt to generate text from.
+            max_new_tokens (int): The maximum number of new tokens to generate.
+
+        Returns:
+            str: The generated text.
+        """
+        self.generator.end_beam_search()
+
+        # Tokenizing the input
+        ids = self.generator.tokenizer.encode(prompt)
+
+        self.generator.gen_begin_reuse(ids)
+        initial_len = self.generator.sequence[0].shape[0]
+        has_leading_space = False
+        decoded_text = ''
+        for i in range(max_new_tokens):
+            token = self.generator.gen_single_token()
+            if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
+                has_leading_space = True
+
+            decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
+            if has_leading_space:
+                decoded_text = ' ' + decoded_text
+
+            if token.item() == self.generator.tokenizer.eos_token_id:
+                break
+        return decoded_text
+
+    def Health(self, request, context):
+        """
+        Returns a health check message.
+
+        Args:
+            request: The health check request.
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Reply: The health check reply.
+        """
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    async def LoadModel(self, request, context):
+        """
+        Loads a language model.
+
+        Args:
+            request: The load model request.
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Result: The load model result.
+        """
+        engine_args = AsyncEngineArgs(
+            model=request.Model,
+        )
+
+        if request.Quantization != "":
+            engine_args.quantization = request.Quantization
+        if request.LoadFormat != "":
+            engine_args.load_format = request.LoadFormat
+        if request.GPUMemoryUtilization != 0:
+            engine_args.gpu_memory_utilization = request.GPUMemoryUtilization
+        if request.TrustRemoteCode:
+            engine_args.trust_remote_code = request.TrustRemoteCode
+        if request.EnforceEager:
+            engine_args.enforce_eager = request.EnforceEager
+        if request.TensorParallelSize:
+            engine_args.tensor_parallel_size = request.TensorParallelSize
+        if request.SwapSpace != 0:
+            engine_args.swap_space = request.SwapSpace
+        if request.MaxModelLen != 0:
+            engine_args.max_model_len = request.MaxModelLen
+        if request.DisableLogStatus:
+            engine_args.disable_log_status = request.DisableLogStatus
+        if request.DType != "":
+            engine_args.dtype = request.DType
+        if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0:
+            # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs
+            engine_args.limit_mm_per_prompt = {
+                "image": max(request.LimitImagePerPrompt, 1),
+                "video": max(request.LimitVideoPerPrompt, 1),
+                "audio": max(request.LimitAudioPerPrompt, 1)
+            }
+
+        try:
+            self.llm = AsyncLLMEngine.from_engine_args(engine_args)
+        except Exception as err:
+            print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        try:
+           engine_model_config = await self.llm.get_model_config()
+           self.tokenizer = get_tokenizer(
+               engine_model_config.tokenizer,
+               tokenizer_mode=engine_model_config.tokenizer_mode,
+               trust_remote_code=engine_model_config.trust_remote_code,
+               truncation_side="left",
+           )
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        print("Model loaded successfully", file=sys.stderr)
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    async def Predict(self, request, context):
+        """
+        Generates text based on the given prompt and sampling parameters.
+
+        Args:
+            request: The predict request.
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Reply: The predict result.
+        """
+        gen = self._predict(request, context, streaming=False)
+        res = await gen.__anext__()
+        return res
+
+    def Embedding(self, request, context):
+        """
+        A gRPC method that calculates embeddings for a given sentence.
+
+        Args:
+            request: An EmbeddingRequest object that contains the request parameters.
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            An EmbeddingResult object that contains the calculated embeddings.
+        """
+        print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
+        outputs = self.model.encode(request.Embeddings)
+        # Check if we have one result at least
+        if len(outputs) == 0:
+            context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+            context.set_details("No embeddings were calculated.")
+            return backend_pb2.EmbeddingResult()
+        return backend_pb2.EmbeddingResult(embeddings=outputs[0].outputs.embedding)
+
+    async def PredictStream(self, request, context):
+        """
+        Generates text based on the given prompt and sampling parameters, and streams the results.
+
+        Args:
+            request: The predict stream request.
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Result: The predict stream result.
+        """
+        iterations = self._predict(request, context, streaming=True)
+        try:
+            async for iteration in iterations:
+                yield iteration
+        finally:
+            await iterations.aclose()
+
+    async def _predict(self, request, context, streaming=False):
+        # Build the sampling parameters
+        # NOTE: this must stay in sync with the vllm backend
+        request_to_sampling_params = {
+            "N": "n",
+            "PresencePenalty": "presence_penalty",
+            "FrequencyPenalty": "frequency_penalty",
+            "RepetitionPenalty": "repetition_penalty",
+            "Temperature": "temperature",
+            "TopP": "top_p",
+            "TopK": "top_k",
+            "MinP": "min_p",
+            "Seed": "seed",
+            "StopPrompts": "stop",
+            "StopTokenIds": "stop_token_ids",
+            "BadWords": "bad_words",
+            "IncludeStopStrInOutput": "include_stop_str_in_output",
+            "IgnoreEOS": "ignore_eos",
+            "Tokens": "max_tokens",
+            "MinTokens": "min_tokens",
+            "Logprobs": "logprobs",
+            "PromptLogprobs": "prompt_logprobs",
+            "SkipSpecialTokens": "skip_special_tokens",
+            "SpacesBetweenSpecialTokens": "spaces_between_special_tokens",
+            "TruncatePromptTokens": "truncate_prompt_tokens",
+            "GuidedDecoding": "guided_decoding",
+        }
+
+        sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
+
+        for request_field, param_field in request_to_sampling_params.items():
+            if hasattr(request, request_field):
+                value = getattr(request, request_field)
+                if value not in (None, 0, [], False):
+                    setattr(sampling_params, param_field, value)
+
+        # Extract image paths and process images
+        prompt = request.Prompt
+
+        image_paths = request.Images
+        image_data = [self.load_image(img_path) for img_path in image_paths]
+
+        videos_path = request.Videos
+        video_data = [self.load_video(video_path) for video_path in videos_path]
+
+        # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
+        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
+            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
+
+        # Generate text using the LLM engine
+        request_id = random_uuid()
+        print(f"Generating text with request_id: {request_id}", file=sys.stderr)
+        multi_modal_data = {}
+        if image_data:
+            multi_modal_data["image"] = image_data
+        if video_data:
+            multi_modal_data["video"] = video_data
+        outputs = self.llm.generate(
+            {
+            "prompt": prompt,
+            "multi_modal_data": multi_modal_data if multi_modal_data else None,
+            },
+            sampling_params=sampling_params,
+            request_id=request_id,
+        )
+
+        # Stream the results
+        generated_text = ""
+        try:
+            async for request_output in outputs:
+                iteration_text = request_output.outputs[0].text
+
+                if streaming:
+                    # Remove text already sent as vllm concatenates the text from previous yields
+                    delta_iteration_text = iteration_text.removeprefix(generated_text)
+                    # Send the partial result
+                    yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8'))
+
+                # Keep track of text generated
+                generated_text = iteration_text
+        finally:
+            await outputs.aclose()
+
+        # If streaming, we already sent everything
+        if streaming:
+            return
+
+        # Remove the image files from /tmp folder
+        for img_path in image_paths:
+            try:
+                os.remove(img_path)
+            except Exception as e:
+                print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
+
+        # Sending the final generated text
+        yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
+
+    def load_image(self, image_path: str):
+        """
+        Load an image from the given file path or base64 encoded data.
+
+        Args:
+            image_path (str): The path to the image file or base64 encoded data.
+
+        Returns:
+            Image: The loaded image.
+        """
+        try:
+
+            image_data = base64.b64decode(image_path)
+            image = Image.open(io.BytesIO(image_data))
+            return image
+        except Exception as e:
+            print(f"Error loading image {image_path}: {e}", file=sys.stderr)
+            return None
+
+    def load_video(self, video_path: str):
+        """
+        Load a video from the given file path.
+
+        Args:
+            video_path (str): The path to the image file.
+
+        Returns:
+            Video: The loaded video.
+        """
+        try:
+            timestamp = str(int(time.time() * 1000))  # Generate timestamp
+            p = f"/tmp/vl-{timestamp}.data"  # Use timestamp in filename
+            with open(p, "wb") as f:
+                f.write(base64.b64decode(video_path))
+            video = VideoAsset(name=p).np_ndarrays
+            os.remove(p)
+            return video
+        except Exception as e:
+            print(f"Error loading video {video_path}: {e}", file=sys.stderr)
+            return None
+
+async def serve(address):
+    # Start asyncio gRPC server
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+        options=[
+            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+        ])
+    # Add the servicer to the server
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    # Bind the server to the address
+    server.add_insecure_port(address)
+
+    # Gracefully shutdown the server on SIGTERM or SIGINT
+    loop = asyncio.get_event_loop()
+    for sig in (signal.SIGINT, signal.SIGTERM):
+        loop.add_signal_handler(
+            sig, lambda: asyncio.ensure_future(server.stop(5))
+        )
+
+    # Start the server
+    await server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+    # Wait for the server to be terminated
+    await server.wait_for_termination()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+
+    asyncio.run(serve(args.addr))