feat: add tts and speech2text model for gpustack

langgenius · Jan 7, 2025 · 9d1b7cf · 9d1b7cf
1 parent 2dbc5ff
commit 9d1b7cf
Show file tree

Hide file tree

Showing 8 changed files with 120 additions and 19 deletions.
diff --git a/models/gpustack/manifest.yaml b/models/gpustack/manifest.yaml
@@ -26,9 +26,9 @@ resource:
       llm: true
       moderation: false
       rerank: true
-      speech2text: false
+      speech2text: true
       text_embedding: true
-      tts: false
+      tts: true
     tool:
       enabled: true
 type: plugin

diff --git a/models/gpustack/models/llm/llm.py b/models/gpustack/models/llm/llm.py
@@ -2,7 +2,6 @@
 from dify_plugin import OAICompatLargeLanguageModel
 from dify_plugin.entities.model.llm import LLMResult
 from dify_plugin.entities.model.message import PromptMessage, PromptMessageTool
-from yarl import URL
 
 
 class GPUStackLanguageModel(OAICompatLargeLanguageModel):
@@ -17,10 +16,10 @@ def _invoke(
         stream: bool = True,
         user: str | None = None,
     ) -> LLMResult | Generator:
-        self._add_custom_parameters(credentials)
+        compatible_credentials = self._get_compatible_credentials(credentials)
         return super()._invoke(
             model,
-            credentials,
+            compatible_credentials,
             prompt_messages,
             model_parameters,
             tools,
@@ -30,11 +29,14 @@ def _invoke(
         )
 
     def validate_credentials(self, model: str, credentials: dict) -> None:
-        self._add_custom_parameters(credentials)
-        super().validate_credentials(model, credentials)
+        compatible_credentials = self._get_compatible_credentials(credentials)
+        super().validate_credentials(model, compatible_credentials)
 
     def _add_custom_parameters(self, credentials: dict) -> None:
-        credentials["endpoint_url"] = str(
-            URL(credentials["endpoint_url"]) / "v1-openai"
-        )
         credentials["mode"] = "chat"
+
+    def _get_compatible_credentials(self, credentials: dict) -> dict:
+        """Return a credentials copy with a "/v1-openai" endpoint_url and mode "chat"."""
+        base_url = credentials["endpoint_url"].rstrip("/").removesuffix("/v1-openai")
+        # The pre-refactor code forced mode="chat" on every call; nothing sets it now.
+        return {**credentials, "endpoint_url": f"{base_url}/v1-openai", "mode": "chat"}
diff --git a/models/gpustack/models/speech2text/__init__.py b/models/gpustack/models/speech2text/__init__.py
diff --git a/models/gpustack/models/speech2text/speech2text.py b/models/gpustack/models/speech2text/speech2text.py
@@ -0,0 +1,38 @@
+from typing import Optional
+
+from dify_plugin import OAICompatSpeechToTextModel
+from dify_plugin.entities.model.speech2text import SpeechToTextResult
+
+
+class GPUStackSpeechToTextModel(OAICompatSpeechToTextModel):
+    """
+    Speech-to-text model for GPUStack, using an endpoint_url ending in "/v1-openai".
+    """
+
+    def _invoke(
+        self,
+        model: str,
+        credentials: dict,
+        audio: bytes,
+        user: Optional[str] = None,
+    ) -> SpeechToTextResult:
+        # The base class receives a copy; the caller's credentials stay untouched.
+        patched = self._get_compatible_credentials(credentials)
+        return super()._invoke(model, patched, audio, user)
+
+    def validate_credentials(self, model: str, credentials: dict) -> None:
+        """
+        Validate model credentials using the normalised endpoint_url.
+
+        :param model: model name
+        :param credentials: model credentials
+        """
+        super().validate_credentials(model, self._get_compatible_credentials(credentials))
+
+    def _get_compatible_credentials(self, credentials: dict) -> dict:
+        """Return a copy of credentials whose endpoint_url ends in "/v1-openai"."""
+        # Drop trailing slashes and any existing suffix so it is not appended twice.
+        endpoint = credentials["endpoint_url"].rstrip("/").removesuffix("/v1-openai")
+        return {**credentials, "endpoint_url": endpoint + "/v1-openai"}
+
+
diff --git a/models/gpustack/models/text_embedding/text_embedding.py b/models/gpustack/models/text_embedding/text_embedding.py
@@ -18,15 +18,15 @@ def _invoke(
         user: Optional[str] = None,
         input_type: EmbeddingInputType = EmbeddingInputType.DOCUMENT,
     ) -> TextEmbeddingResult:
-        self._add_custom_parameters(credentials)
-        return super()._invoke(model, credentials, texts, user, input_type)
+        compatible_credentials = self._get_compatible_credentials(credentials)
+        return super()._invoke(model, compatible_credentials, texts, user, input_type)
 
     def validate_credentials(self, model: str, credentials: dict) -> None:
-        self._add_custom_parameters(credentials)
-        super().validate_credentials(model, credentials)
+        compatible_credentials = self._get_compatible_credentials(credentials)
+        super().validate_credentials(model, compatible_credentials)
 
-    @staticmethod
-    def _add_custom_parameters(credentials: dict) -> None:
-        credentials["endpoint_url"] = str(
-            URL(credentials["endpoint_url"]) / "v1-openai"
-        )
+    def _get_compatible_credentials(self, credentials: dict) -> dict:
+        """Return a copy of credentials whose endpoint_url ends in "/v1-openai"."""
+        # Drop trailing slashes and any existing suffix so it is not appended twice.
+        endpoint = credentials["endpoint_url"].rstrip("/").removesuffix("/v1-openai")
+        return {**credentials, "endpoint_url": endpoint + "/v1-openai"}
diff --git a/models/gpustack/models/tts/__init__.py b/models/gpustack/models/tts/__init__.py
diff --git a/models/gpustack/models/tts/tts.py b/models/gpustack/models/tts/tts.py
@@ -0,0 +1,43 @@
+from collections.abc import Generator
+from dify_plugin import OAICompatTextToSpeechModel
+from dify_plugin.entities.model.tts import TTSResult
+
+
+class GPUStackTextToSpeechModel(OAICompatTextToSpeechModel):
+    """
+    Model class for GPUStack Text to Speech model.
+    """
+
+    def _invoke(
+        self,
+        model: str,
+        credentials: dict,
+        text: str,
+        user: str | None = None,
+    ) -> TTSResult | Generator:
+        # NOTE(review): confirm (text, user) matches the base-class _invoke parameters.
+        compatible_credentials = self._get_compatible_credentials(credentials)
+        return super()._invoke(model, compatible_credentials, text, user)
+
+    def validate_credentials(self, model: str, credentials: dict, user: str | None = None) -> None:
+        """
+        Validate model credentials
+
+        :param model: model name
+        :param credentials: model credentials
+        :param user: unique user id (accepted but not forwarded to the base class)
+        """
+        compatible_credentials = self._get_compatible_credentials(credentials)
+        super().validate_credentials(model, compatible_credentials)
+
+    def _get_compatible_credentials(self, credentials: dict) -> dict:
+        """
+        Return a copy of the credentials whose endpoint_url ends in "/v1-openai".
+
+        :param credentials: model credentials; must contain "endpoint_url" (not mutated)
+        :return: compatible credentials
+        """
+        compatible_credentials = credentials.copy()
+        base_url = credentials["endpoint_url"].rstrip("/").removesuffix("/v1-openai")
+        compatible_credentials["endpoint_url"] = f"{base_url}/v1-openai"
+        return compatible_credentials
diff --git a/models/gpustack/provider/gpustack.yaml b/models/gpustack/provider/gpustack.yaml
@@ -113,6 +113,22 @@ model_credential_schema:
           variable: __model_type
       type: select
       variable: vision_support
+    - variable: voices
+      show_on:
+        - variable: __model_type
+          value: tts
+      label:
+        en_US: Available Voices (comma-separated)
+        zh_Hans: 可用声音（用英文逗号分隔）
+      type: text-input
+      required: false
+      default: "Chinese Female"
+      placeholder:
+        en_US: "Chinese Female, Chinese Male, Japanese Male, Cantonese Female, English Female, English Male, Korean Female"
+        zh_Hans: "Chinese Female, Chinese Male, Japanese Male, Cantonese Female, English Female, English Male, Korean Female"
+      help:
+        en_US: "List voice names separated by commas. First voice will be used as default."
+        zh_Hans: "用英文逗号分隔的声音列表。第一个声音将作为默认值。"
   model:
     label:
       en_US: Model Name
@@ -125,3 +141,5 @@ supported_model_types:
   - llm
   - text-embedding
   - rerank
+  - speech2text
+  - tts