Merge pull request #1 from josebenitezg/add_tts
add new features
josebenitezg authored Nov 8, 2023
2 parents c220806 + 30082de commit e5b86a2
Showing 6 changed files with 266 additions and 30 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -3,7 +3,9 @@ venv/
data/
*.png
*.mp4
*.mp3
*.jpg
build/
dist/
*.egg-info/
*.egg-info/
test_api.py
118 changes: 99 additions & 19 deletions README.md
@@ -1,56 +1,136 @@
## VisionAPI 👀 🚧
# VisionAPI 👓✨ - AI Vision & Language Processing

#### Hey there
### Welcome to the Future of AI Vision 🌟

This is a Work In Progress Project.
The goal is to bring GPT-based Models to a simple API
Hello and welcome to VisionAPI, where cutting-edge GPT-based models meet simplicity in a sleek API interface. Our mission is to harness the power of AI to work with images, videos, and audio so you can build apps faster than ever.

### How to use
### 🚀 Getting Started

##### Installation
#### Prerequisites

Make sure you have Python installed on your system and you're ready to dive into the world of AI.

#### 📦 Installation

To install VisionAPI, simply run the following command in your terminal:

```bash
pip install visionapi
```
##### Authentication
##### 🔑 Authentication
Before you begin, authenticate your OpenAI API key with the following command:

```bash
export OPENAI_API_KEY=<your key>
export OPENAI_API_KEY='your-api-key-here'
```
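
Prefer to keep the key out of your shell profile? Here is a minimal sketch of setting it from Python instead. Note that VisionAPI reads `OPENAI_API_KEY` from the environment when the module is imported, so set it before `import visionapi` (the key string below is a placeholder):

```python
import os

# Set the key before importing visionapi, since the module reads
# OPENAI_API_KEY from the environment when it is imported.
os.environ["OPENAI_API_KEY"] = "your-api-key-here"

import visionapi

inference = visionapi.Inference()
```
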
##### Image Inference
We can use an image url, local image path or numpy array to make an inference.
#### 🔩 Usage
##### 🖼️ Image Inference
Empower your applications to understand and describe images with precision.

```python
import visionapi

inference_endpoint = visionapi.Inference()
# Initialize the Inference Engine
inference = visionapi.Inference()

# Provide an image URL or a local path
image = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

prompt = "Describe the image"
# Set your descriptive prompt
prompt = "What is this image about?"

response = inference_endpoint.image_inference(image, prompt)
# Get the AI's perspective
response = inference.image(image, prompt)

# Revel in the AI-generated description
print(response.message.content)


```
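
The example above uses a URL, but local frames work too. Here is a small sketch passing a numpy array loaded with OpenCV (the file path is a placeholder; per this commit's `image` method, arrays are JPEG-encoded with `cv2.imencode` before upload):

```python
import cv2
import visionapi

inference = visionapi.Inference()

# Load a local image into a numpy array (OpenCV returns BGR)
frame = cv2.imread("path/to/photo.jpg")

# The same `image` call accepts the array directly
response = inference.image(frame, "Describe what you see in this photo.")
print(response.message.content)
```
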
##### Video Inference
##### 🎥 Video Inference
Narrate the stories unfolding in your videos with our AI-driven descriptions.

```python
import visionapi

inference_endpoint = visionapi.Inference()
# Gear up the Inference Engine
inference = visionapi.Inference()

prompt = "These are frames from a video that I want to upload. Generate a compelling description that I can upload along with the video."
# Craft a captivating prompt
prompt = "Summarize the key moments in this video."

video = "video.mp4"
# Point to your video file
video = "path/to/video.mp4"

response = inference_endpoint.video_inference(video, prompt)
# Let the AI weave the narrative
response = inference.video(video, prompt)

# Display the narrative
print(response.message.content)

```

##### 🎨 Image Generation
Watch your words paint pictures with our intuitive image generation capabilities.

```python
import visionapi

# Activate the Inference Engine
inference = visionapi.Inference()

# Describe your vision
prompt = "A tranquil lake at sunset with mountains in the background."

# Bring your vision to life
image_urls = inference.generate_image(prompt, save=True) # Set `save=True` to store locally

# Behold the AI-crafted imagery
print(image_urls)
```
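
Want several images, or just the hosted URLs? The `generate_image` method in this commit also exposes `size`, `quality`, and `qty`. A hedged sketch (the prompt is illustrative):

```python
import visionapi

inference = visionapi.Inference()

prompt = "A watercolor illustration of a lighthouse at dawn."

# qty issues one DALL-E 3 request per image; with save=False the
# method returns the generated image URLs instead of local file paths.
image_urls = inference.generate_image(prompt, size="1024x1024", quality="standard", qty=2, save=False)

for url in image_urls:
    print(url)
```
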

##### 🗣️ TTS (Text to Speech)
Transform your text into natural-sounding speech with just a few lines of code.

```python
import visionapi

# Power up the Inference Engine
inference = visionapi.Inference()

# Specify where to save the audio
save_path = "output/speech.mp3"

# Type out what you need to vocalize
text = "Hey, ready to explore AI-powered speech synthesis?"

# Make the AI speak
inference.TTS(text, save_path)
```
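
`TTS` also accepts a `stream` flag (default `True` in this commit) and returns the path it saved to. A quick sketch, assuming the `output/` directory already exists:

```python
import visionapi

inference = visionapi.Inference()

# stream=True (the default) streams the generated audio straight to disk;
# the method returns the path of the saved file.
audio_path = inference.TTS(
    "VisionAPI can talk back to you, too.",
    "output/greeting.mp3",
    stream=True,
)
print(audio_path)
```
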

##### 🎧 STT (Speech to Text)
Convert audio into text with unparalleled clarity, opening up a world of possibilities.

```python
import visionapi

# Initialize the Inference Engine
inference = visionapi.Inference()

# Convert spoken words to written text
text = inference.STT('path/to/audio.mp3')

# Marvel at the transcription
print(text)
```
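
`STT` additionally takes a `response_format` argument (default `"text"` in this commit). A brief sketch with a placeholder audio path:

```python
import visionapi

inference = visionapi.Inference()

# response_format="text" (the default) returns the transcript as a plain string;
# "json" asks Whisper for the full response instead.
transcript = inference.STT("path/to/interview.mp3", response_format="text")
print(transcript)
```
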

## 🌐 Contribute
Add cool stuff:

- Fork the repository.
- Extend the capabilities by integrating more models.
- Enhance existing features or add new ones.
- Submit a pull request with your improvements.

Your contributions are what make VisionAPI not just a tool, but a community.

Contribute to this project by adding more models and features.
Binary file modified VisionAPI/__pycache__/app.cpython-311.pyc
Binary file not shown.
168 changes: 161 additions & 7 deletions VisionAPI/app.py
@@ -1,22 +1,61 @@
import os
import cv2
import base64
import requests
import numpy as np
from pathlib import Path
from visionapi.utils import encode_video
from openai import OpenAI

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
GPT_MODEL = "gpt-4-vision-preview"
TTS_MODEL = "tts-1"
STT_MODEL = "whisper-1"
VOICE = "alloy"
IMAGE_GEN_MODEL = "dall-e-3"

class Inference:
'''
The Inference class provides methods to perform image and video inference,
text-to-speech conversion, and image generation with OpenAI's models.
Attributes:
- client (OpenAI): The OpenAI client initialized with the API key.
- gpt_model (str): The identifier for the GPT model used for inferences.
Methods:
- image_inference: Processes an image with a given prompt and returns the result.
- video_inference: Processes a video with a given prompt and returns the result.
- webcam_inference: Placeholder for future webcam processing functionality.
- text_to_speech: Converts given text to speech and saves it to a file.
- generate_image: Generates images based on a text prompt and optionally saves them to disk.
Initialization:
The constructor initializes the OpenAI client and sets the GPT model.
It requires an environment variable 'OPENAI_API_KEY' to be set for the API key.
Example:
inference = Inference()
'''
def __init__(self):
self.client = OpenAI()
if OPENAI_API_KEY is None:
raise ValueError("OPENAI_API_KEY is not set")
self.api_key = OPENAI_API_KEY
self.gpt_model = GPT_MODEL

def image_inference(self, image_input, prompt) -> str:
# Function to encode numpy array or image file to base64
def image(self, image_input, prompt) -> str:
'''
Processes an image or image URL with a given text prompt using the specified GPT model.
Returns the inference result as a string.
Parameters:
- image_input (np.ndarray or str): A numpy array of the image or a URL pointing to an image.
- prompt (str): A text prompt for the GPT model to interpret the image.
Example:
response = inference.image(image_array, "What is in this image?")
'''
def encode_to_base64(image):
if isinstance(image, np.ndarray):
success, encoded_image = cv2.imencode('.jpg', image)
@@ -56,10 +95,17 @@ def encode_to_base64(image):
return response.choices[0]


def video_inference(self, input_video, prompt) -> str:
def video(self, input_video, prompt) -> str:
'''
We don't need to send every frame for GPT
to understand what's going on
Processes a video file with a given text prompt and returns the result.
The method selects frames from the video and sends them for processing.
Parameters:
- input_video (str): A file path to the input video.
- prompt (str): A text prompt for the GPT model to interpret the video.
Example:
response = inference.video("path/to/video.mp4", "Summarize the actions in the video.")
'''
base64Frames = encode_video(input_video)

@@ -81,6 +127,114 @@ def video_inference(self, input_video, prompt) -> str:

result = self.client.chat.completions.create(**params)
return result.choices[0].message.content

def TTS(self, text, save_path, stream=True):
'''
Converts given text to speech using the TTS model and saves the output as an audio file.
Parameters:
- text (str): The text to convert to speech.
- save_path (str): The file path where the audio will be saved.
- stream (bool): If True, streams the audio to the file as it is generated (default True).
Example:
inference.TTS("Hello world!", "path/to/save/speech.mp3", stream=True)
'''
# Ensure the save_path is a Path object
save_path = Path(save_path)

# Prepare the TTS API parameters
tts_params = {
"model": TTS_MODEL,
"voice": VOICE,
"input": text
}

# Create the TTS response
response = self.client.audio.speech.create(**tts_params)

def webcam_inference(self, prompt, device=0) -> str:
print('coming soon')
# If streaming is desired, stream to file, otherwise save the file normally
if stream:
response.stream_to_file(str(save_path))
else:
with save_path.open("wb") as f:
f.write(response.audio)

return str(save_path)

def generate_image(self, prompt, size="1024x1024", quality="standard", qty=1, save=False):
'''
Generates images based on a text prompt using the DALL-E 3 model.
Parameters:
- prompt (str): The text prompt to generate images from.
- size (str): The size of the generated images (default "1024x1024").
- quality (str): The quality of the generated images, e.g., "standard".
- qty (int): The number of images to generate. Note: the DALL-E 3 API currently
allows generating one image at a time, so this method will make `qty` separate
requests to generate `qty` images.
- save (bool): If True, saves the generated images to disk in the current
directory. The filenames will be `generated_image_1.png`, `generated_image_2.png`, etc.
Returns:
- List of URLs of the generated images if `save` is False. If `save` is True,
returns a list of file paths to the saved images.
Example usage:
inference = Inference()
image_urls = inference.generate_image("a white siamese cat", qty=10, save=True)
'''
image_urls = []

for _ in range(qty): # Loop over the number of images requested
# Create the image generation response
response = self.client.images.generate(
model=IMAGE_GEN_MODEL,
prompt=prompt,
size=size,
quality=quality,
n=1 # Each request generates one image
)
image_url = response.data[0].url
image_urls.append(image_url)

if save:
# Save the image to disk
image_response = requests.get(image_url)
image_response.raise_for_status() # Raises an HTTPError if the HTTP request returned an unsuccessful status code

# Construct the image filename
image_number = len(image_urls)
image_path = Path(f"generated_image_{image_number}.png")
with open(image_path, "wb") as f:
f.write(image_response.content)
image_urls[-1] = str(image_path) # Update the URL to local path

return image_urls

def STT(self, audio_file_path, response_format="text"):
'''
Converts speech from an audio file to text using the Whisper model.
Parameters:
- audio_file_path (str): The file path to the audio file to transcribe.
- response_format (str): The format of the response. Can be "json" or "text". Default is "text".
Returns:
- If response_format is "json", returns the full JSON response from the API.
- If response_format is "text", returns just the transcribed text as a string.
Example usage:
transcript = inference.STT("/path/to/audio.mp3")
print(transcript)
'''
with open(audio_file_path, "rb") as audio_file:
transcript = self.client.audio.transcriptions.create(
model=STT_MODEL,
file=audio_file,
response_format=response_format
)
if response_format == "text":
return transcript
else:
return transcript.text
Binary file modified examples/__pycache__/app.cpython-311.pyc
Binary file not shown.
6 changes: 3 additions & 3 deletions examples/app.py
@@ -8,13 +8,13 @@
import gradio as gr
import numpy as np

import visionapi as VisionAPI
import visionapi

HEADER = """
# 📸 VisionAPI.
"""

inference_endpoint = VisionAPI.Inference()
inference = visionapi.Inference()

def save_image_to_drive(image: np.ndarray) -> str:
image_filename = f"{uuid.uuid4()}.jpeg"
@@ -29,7 +29,7 @@ def respond(image: np.ndarray, prompt: str, chat_history):
image = np.fliplr(image)
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
image_path = save_image_to_drive(image)
response = inference_endpoint.image_inference(image, prompt)
response = inference.image(image, prompt)
chat_history.append(((image_path,), None))
chat_history.append((prompt, response.message.content))
return "", chat_history
