From 884c01a7202ee165dd91a337b1b19f536122c51d Mon Sep 17 00:00:00 2001
From: Yohan Totting
Date: Sun, 2 Feb 2025 23:17:01 +0700
Subject: [PATCH] Add tts-to-webrtc example

---
 tts-to-webrtc/README.md  |  79 ++++++++++
 tts-to-webrtc/index.html | 172 +++++++++++++++++++++
 tts-to-webrtc/main.go    | 321 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 572 insertions(+)
 create mode 100644 tts-to-webrtc/README.md
 create mode 100644 tts-to-webrtc/index.html
 create mode 100644 tts-to-webrtc/main.go

diff --git a/tts-to-webrtc/README.md b/tts-to-webrtc/README.md
new file mode 100644
index 00000000..dc152ab4
--- /dev/null
+++ b/tts-to-webrtc/README.md
@@ -0,0 +1,79 @@
# WebRTC Text-to-Speech Example

This example combines WebRTC with OpenAI's Text-to-Speech API to stream synthesized audio to the browser in real time.

## Prerequisites

- Go 1.21 or later (the example uses the built-in `min` function)
- An OpenAI API key
- A web browser with WebRTC support (Chrome, Firefox, Safari, etc.)

## Installation

1. Clone the repository:
```bash
git clone https://github.com/pion/example-webrtc-applications.git
cd example-webrtc-applications/tts-to-webrtc
```

2. Install native dependencies:

The [resampler](https://github.com/dh1tw/gosamplerate) and [Opus encoder](https://github.com/hraban/opus) packages use cgo and need the underlying C libraries to be installed. Follow the instructions below for your platform.

Linux (apt on Ubuntu/Debian, yum on CentOS, etc.):
```bash
sudo apt install libsamplerate0 pkg-config libopus-dev libopusfile-dev
```

macOS (using Homebrew):
```bash
brew install libsamplerate pkg-config opus opusfile
```

3. Install Go dependencies:
```bash
export GO111MODULE=on
go install github.com/pion/example-webrtc-applications/v4/tts-to-webrtc@latest
```

## Configuration

Set your OpenAI API key as an environment variable:

```bash
export OPENAI_API_KEY=your_api_key_here
```

## Running the Application

1. Start the server:
```bash
go run main.go
```

2. Open your web browser and navigate to:
```
http://localhost:8080
```

## Usage

1. Click the "Connect" button to establish a WebRTC connection
2. Wait for the connection status to show "connected"
3. Type some text in the textarea
4. Click "Convert to Speech" to hear the text being spoken

## Technical Details

- The application uses OpenAI's TTS API (`tts-1`, PCM output) to convert text to speech
- Audio is streamed to the browser over WebRTC using the Opus codec
- Sample-rate conversion from the API's 24 kHz PCM to Opus's 48 kHz is handled automatically
- The server keeps a simple audio buffer and drains it in 20 ms frames

## License

This project is licensed under the MIT License - see the LICENSE file for details.
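
## Testing the TTS Endpoint

For quick testing without the web page, the `/tts` handler can also be driven directly. A minimal example, assuming the server is running locally on port 8080 and a WebRTC session is already connected:

```bash
curl -X POST http://localhost:8080/tts \
  -H 'Content-Type: application/json' \
  -d '{"text": "Hello from Pion!"}'
# => {"message": "TTS request received"}
```

The synthesized audio is appended to the server's speech buffer and played out on whichever WebRTC session is currently connected.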

diff --git a/tts-to-webrtc/index.html b/tts-to-webrtc/index.html
new file mode 100644
index 00000000..e5320c0e
--- /dev/null
+++ b/tts-to-webrtc/index.html
@@ -0,0 +1,172 @@
<!--
  The 172-line index.html did not survive extraction into this patch text.
  What remains recoverable: the page title "WebRTC TTS Demo", a
  "Text to Speech with WebRTC" heading, status lines for Connection State,
  ICE Connection, and Signaling State (all starting at "new"), and the
  Connect button, textarea, and "Convert to Speech" button described in
  the README.
-->
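Since the original markup and client-side script are not recoverable here, the following is a minimal sketch of what the page has to do against the handlers in main.go: create a receive-only audio transceiver, POST the offer to `/webrtc` with the SDP JSON-encoded in an `sdp` field, apply the returned answer, and POST `{"text": ...}` to `/tts`. Element IDs and layout are illustrative assumptions, not the original file's contents.

```html
<!DOCTYPE html>
<html>
<head><title>WebRTC TTS Demo</title></head>
<body>
  <h1>Text to Speech with WebRTC</h1>
  <div>Connection State: <span id="connectionState">new</span></div>
  <button id="connect">Connect</button>
  <textarea id="text"></textarea>
  <button id="speak">Convert to Speech</button>
  <audio id="audio" autoplay></audio>

  <script>
    const pc = new RTCPeerConnection({
      iceServers: [{ urls: "stun:stun.l.google.com:19302" }],
    });

    // Receive-only audio: the server adds the TTS track on its side.
    pc.addTransceiver("audio", { direction: "recvonly" });
    pc.ontrack = (event) => {
      document.getElementById("audio").srcObject = event.streams[0];
    };
    pc.onconnectionstatechange = () => {
      document.getElementById("connectionState").textContent = pc.connectionState;
    };

    document.getElementById("connect").onclick = async () => {
      await pc.setLocalDescription(await pc.createOffer());

      // Wait for ICE gathering so the offer already carries candidates;
      // the server exposes no endpoint for trickled candidates.
      await new Promise((resolve) => {
        if (pc.iceGatheringState === "complete") return resolve();
        pc.addEventListener("icegatheringstatechange", () => {
          if (pc.iceGatheringState === "complete") resolve();
        });
      });

      // main.go expects {"sdp": "<JSON-encoded SessionDescription>"}.
      const res = await fetch("/webrtc", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ sdp: JSON.stringify(pc.localDescription) }),
      });
      await pc.setRemoteDescription(await res.json());
    };

    document.getElementById("speak").onclick = async () => {
      await fetch("/tts", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ text: document.getElementById("text").value }),
      });
    };
  </script>
</body>
</html>
```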
diff --git a/tts-to-webrtc/main.go b/tts-to-webrtc/main.go
new file mode 100644
index 00000000..1fac3296
--- /dev/null
+++ b/tts-to-webrtc/main.go
@@ -0,0 +1,321 @@
package main

import (
	"bytes"
	"context"
	"encoding/binary"
	"encoding/json"
	"io"
	"log"
	"math"
	"net/http"
	"os"
	"sync"
	"time"

	"github.com/dh1tw/gosamplerate"
	"github.com/pion/webrtc/v4"
	"github.com/pion/webrtc/v4/pkg/media"
	"gopkg.in/hraban/opus.v2"
)

type WebRTCMessage struct {
	SDP string `json:"sdp"`
}

type TTSRequest struct {
	Text string `json:"text"`
}

type TTSResponse struct {
	Audio []byte
}

// Globals keep this example to a single audio pipeline; each new /webrtc
// request replaces the previous connection.
var (
	peerConnection *webrtc.PeerConnection
	audioTrack     *webrtc.TrackLocalStaticSample

	ticker        *time.Ticker
	samplesNeeded int

	resampleLength, srcSampleRate int

	speechBuffer []int16
	resampler    gosamplerate.Src

	err error

	mu sync.Mutex
)

func main() {
	srcSampleRate = 24000

	mu = sync.Mutex{}

	ticker = time.NewTicker(20 * time.Millisecond)

	samplesNeeded = int(float32(srcSampleRate) * 0.02) // 24000 Hz * 0.02 s = 480 samples per tick

	// Expected number of output samples per 20 ms frame at 48 kHz,
	// used as the resampler's buffer size.
	resampleLength = int(math.Ceil(float64(48000) / float64(srcSampleRate) * float64(samplesNeeded)))

	speechBuffer = make([]int16, 0)

	// Configuration for the RTCPeerConnection created per /webrtc request
	config := webrtc.Configuration{
		ICEServers: []webrtc.ICEServer{
			{
				URLs: []string{"stun:stun.l.google.com:19302"},
			},
		},
	}

	resampler, err = gosamplerate.New(gosamplerate.SRC_SINC_MEDIUM_QUALITY, 1, resampleLength)
	if err != nil {
		log.Fatalf("Failed to create resampler: %v", err)
	}

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		http.ServeFile(w, r, "index.html")
	})

	http.HandleFunc("/webrtc", handleWebRTC(&config))
	http.HandleFunc("/tts", handleTTS)

	log.Println("Server started at http://localhost:8080")
	log.Fatal(http.ListenAndServe(":8080", nil))
}

func handleWebRTC(config *webrtc.Configuration) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		var msg WebRTCMessage
		if err := json.NewDecoder(r.Body).Decode(&msg); err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}

		var err error
		peerConnection, err = webrtc.NewPeerConnection(*config)
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		// Create a new audio track
		audioTrack, err = webrtc.NewTrackLocalStaticSample(
			webrtc.RTPCodecCapability{MimeType: "audio/opus"},
			"audio",
			"pion",
		)
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		// Start draining the speech buffer into the track.
		go processSpeechBuffer(context.Background(), audioTrack)

		if _, err = peerConnection.AddTrack(audioTrack); err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		offer := webrtc.SessionDescription{}
		if err := json.Unmarshal([]byte(msg.SDP), &offer); err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}

		if err := peerConnection.SetRemoteDescription(offer); err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		// Create answer
		answer, err := peerConnection.CreateAnswer(nil)
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		// Wait for ICE gathering to finish so the answer already carries
		// the server's candidates; the client has no channel for
		// trickled candidates.
		gatherComplete := webrtc.GatheringCompletePromise(peerConnection)

		if err := peerConnection.SetLocalDescription(answer); err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		<-gatherComplete

		response, err := json.Marshal(peerConnection.LocalDescription())
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		w.Header().Set("Content-Type", "application/json")
		w.Write(response)
	}
}

func handleTTS(w http.ResponseWriter, r *http.Request) {
	var req TTSRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, err.Error(), http.StatusBadRequest)
		return
	}

	// Call the OpenAI TTS API. "pcm" returns raw 16-bit little-endian
	// mono samples at 24 kHz, which matches srcSampleRate above.
	ttsReq := map[string]interface{}{
		"model":           "tts-1",
		"response_format": "pcm",
		"input":           req.Text,
		"voice":           "alloy",
	}

	jsonData, err := json.Marshal(ttsReq)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	request, err := http.NewRequest("POST", "https://api.openai.com/v1/audio/speech", bytes.NewBuffer(jsonData))
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	request.Header.Set("Authorization", "Bearer "+os.Getenv("OPENAI_API_KEY"))
	request.Header.Set("Content-Type", "application/json")

	client := &http.Client{}
	response, err := client.Do(request)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(response.Body)
		http.Error(w, "TTS API error: "+string(body), http.StatusBadGateway)
		return
	}

	audio, err := io.ReadAll(response.Body)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	reader := bytes.NewReader(audio)
	var sample int16
	var samples []int16

	for {
		// Read each sample (2 bytes for int16)
		err := binary.Read(reader, binary.LittleEndian, &sample)
		if err != nil {
			if err == io.EOF {
				break // End of stream
			}
			log.Println("Failed to read sample:", err)
			return
		}

		// Append the int16 sample directly
		samples = append(samples, sample)
	}

	mu.Lock()
	speechBuffer = append(speechBuffer, samples...)
	mu.Unlock()

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)
	w.Write([]byte(`{"message": "TTS request received"}`))
}

func processSpeechBuffer(ctx context.Context, localTrack *webrtc.TrackLocalStaticSample) {
	ctxx, cancel := context.WithCancel(ctx)
	defer cancel()

	encoder, err := opus.NewEncoder(48000, 2, opus.AppVoIP)
	if err != nil {
		log.Printf("Failed to create opus encoder: %v", err)
		return
	}

	audioPayload := make([]byte, 1450)

	for {
		select {
		case <-ctxx.Done():
			return
		case <-ticker.C:
			mu.Lock()
			if len(speechBuffer) > 0 {
				// Take at most 20 ms worth of samples from the front of the buffer.
				samplesToTake := min(len(speechBuffer), samplesNeeded)
				samples := speechBuffer[:samplesToTake]
				speechBuffer = speechBuffer[samplesToTake:]

				mu.Unlock()

				// convert int16 samples to float32
				floatSamples := convertToFloat32(samples)

				// resample audio data to the 48 kHz rate Opus expects
				resampled, err := resample(floatSamples, srcSampleRate, 48000)
				if err != nil {
					log.Printf("Failed to resample audio: %v", err)
					continue
				}

				// convert from mono to stereo
				stereoSamples := make([]float32, len(resampled)*2)
				for i, sample := range resampled {
					stereoSamples[i*2] = sample   // Left channel
					stereoSamples[i*2+1] = sample // Right channel
				}

				// encode PCM data to Opus
				n, err := encoder.EncodeFloat32(stereoSamples, audioPayload)
				if err != nil {
					log.Printf("Failed to encode audio: %v", err)
					continue
				}

				if n > 0 {
					sampleDuration := time.Duration(len(resampled)) * time.Second / 48000

					// write the Opus packet to the local track
					if err = localTrack.WriteSample(media.Sample{
						Data:     audioPayload[:n],
						Duration: sampleDuration,
					}); err != nil {
						log.Printf("Failed to write audio to track: %v", err)
						continue
					}
				}
			} else {
				mu.Unlock()
				// Keep the track alive with a silence frame while the buffer is empty.
				silencePayload := []byte{0xf8, 0xff, 0xfe} // Opus silence frame

				if err := localTrack.WriteSample(media.Sample{
					Data:     silencePayload,
					Duration: 20 * time.Millisecond,
				}); err != nil {
					log.Printf("Failed to write silence to track: %v", err)
				}
			}
		}
	}
}

func convertToFloat32(pcm []int16) []float32 {
	samples := make([]float32, len(pcm))
	for i, v := range pcm {
		samples[i] = float32(v) / float32(1<<15-1)
	}
	return samples
}

func resample(samples []float32, sourceRate int, targetRate int) ([]float32, error) {
	ratio := float64(targetRate) / float64(sourceRate)
	resampled, err := resampler.Process(samples, ratio, false)
	if err != nil {
		return nil, err
	}

	return resampled, nil
}
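
As a sanity check on the framing used above: one 20 ms tick at the 24 kHz source rate is 480 samples, which becomes 960 samples at 48 kHz and 1920 interleaved stereo floats, exactly one legal 20 ms Opus frame. A standalone sketch of that arithmetic (illustrative only, not part of the patch):

```go
package main

import "fmt"

func main() {
	const srcRate, dstRate = 24000, 48000
	const tick = 0.02 // the 20 ms ticker interval used by processSpeechBuffer

	srcSamples := int(srcRate * tick)            // 480 mono samples pulled from speechBuffer per tick
	dstSamples := srcSamples * dstRate / srcRate // 960 mono samples after resampling to 48 kHz
	interleaved := dstSamples * 2                // 1920 floats once duplicated into left/right channels

	// 960 samples per channel at 48 kHz is a valid 20 ms Opus frame size,
	// so the encoder accepts the slice without padding.
	fmt.Println(srcSamples, dstSamples, interleaved) // 480 960 1920
}
```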