-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstt_page.py
89 lines (71 loc) · 2.68 KB
/
stt_page.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from transformers import pipeline
import streamlit as st
import torch
from audiorecorder import audiorecorder
import numpy as np
from src.logger import get_logger
# Set up logger
# NOTE(review): name=None — presumably yields the app-wide/root logger;
# exact behavior depends on src.logger.get_logger (project helper).
logger = get_logger(name=None)
# Audio processing settings
# Sampling rate (Hz) reported to the ASR pipeline. Assumes the recorder
# actually captures at 16 kHz — TODO confirm against audiorecorder output.
SAMPLE_RATE = 16000
def transcribe_audio(audio_segment, pipe):
    """Transcribe a recorded audio segment to text with an ASR pipeline.

    Args:
        audio_segment: pydub-style ``AudioSegment`` (provides
            ``get_array_of_samples()``, ``frame_rate``, ``channels``,
            ``sample_width``).
        pipe: Hugging Face ``automatic-speech-recognition`` pipeline.

    Returns:
        str: the full transcription (all chunks joined); ``""`` when the
        pipeline recognized nothing.

    Raises:
        Exception: re-raises any conversion/pipeline error after logging.
    """
    logger.debug("Processing audio for transcription")
    try:
        # Raw samples are signed integers; the HF ASR pipeline expects
        # float32 PCM normalized to [-1.0, 1.0], so scale by the
        # sample-width maximum (e.g. 32768 for 16-bit audio).
        samples = np.array(audio_segment.get_array_of_samples())
        max_val = float(2 ** (8 * audio_segment.sample_width - 1))
        audio_array = samples.astype(np.float32) / max_val
        # Down-mix multi-channel recordings to mono by averaging channels
        # (interleaved sample layout).
        if audio_segment.channels > 1:
            audio_array = audio_array.reshape(-1, audio_segment.channels).mean(axis=1)
        # Report the segment's actual sampling rate instead of assuming
        # 16 kHz; the pipeline resamples internally when needed.
        audio_dict = {"array": audio_array, "sampling_rate": audio_segment.frame_rate}
        result = pipe(audio_dict, batch_size=8, return_timestamps=True)
        # Join ALL chunks — keeping only chunks[0] drops text for any
        # recording longer than a single chunk window.
        transcription = "".join(chunk["text"] for chunk in result.get("chunks", []))
        logger.info("Successfully transcribed audio")
        return transcription
    except Exception as e:
        logger.error(f"Error transcribing audio: {str(e)}", exc_info=True)
        raise
def main():
    """Render the Streamlit speech-to-text page.

    Flow: load a cached Whisper ASR pipeline, record audio in the browser,
    transcribe it, and keep a running transcript in session state.

    NOTE(review): Streamlit re-runs this whole function on every user
    interaction, so a recording that stays in the widget is re-transcribed
    and re-appended on each rerun — confirm whether that is intended.
    """
    logger.info("Starting Speech to Text application")
    st.title("Speech to Text Transcription")
    # Initialize session state: session_state survives reruns and holds
    # every transcription produced so far.
    if "transcriptions" not in st.session_state:
        st.session_state.transcriptions = []
    # Load pipeline (cached by Streamlit: built once per process, reused
    # across reruns).
    @st.cache_resource
    def load_pipeline():
        """Build the Whisper ASR pipeline, preferring GPU when available."""
        logger.info("Loading Whisper pipeline")
        try:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
            pipe = pipeline(
                "automatic-speech-recognition",
                model="openai/whisper-small",
                chunk_length_s=30,  # split long audio into 30 s windows
                device=device,
            )
            logger.info(f"Successfully loaded pipeline on {device}")
            return pipe
        except Exception as e:
            logger.error(f"Error loading pipeline: {str(e)}", exc_info=True)
            raise
    pipe = load_pipeline()
    # Audio recorder: browser-side widget; returns an AudioSegment-like
    # object (empty — len 0 — until the user records something).
    audio = audiorecorder(
        "Start Recording",
        "Stop Recording",
        custom_style={"color": "black"},
        show_visualizer=True,
    )
    transcription_placeholder = st.empty()
    if len(audio) > 0:
        # Play back the recording for the user.
        st.audio(audio.export().read())
        # Process and transcribe the audio, then append to the transcript.
        text = transcribe_audio(audio, pipe)
        st.session_state.transcriptions.append(f"{text}")
        logger.info(f"Added transcription: {text}")
    # Display all transcriptions accumulated across reruns.
    if st.session_state.transcriptions:
        transcription_placeholder.markdown("".join(st.session_state.transcriptions))
    if st.button("Clear Transcriptions"):
        logger.info("Clearing transcription history")
        st.session_state.transcriptions = []
        transcription_placeholder.empty()
main()