-
Notifications
You must be signed in to change notification settings - Fork 17
/
file-to-text.py
91 lines (77 loc) · 3.48 KB
/
file-to-text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import whisper
import json
from datetime import timedelta
def format_timestamp(seconds):
    """Convert a duration in seconds to an ``HH:MM:SS`` string.

    Args:
        seconds: Offset in seconds (int or float). Fractional seconds are
            dropped, not rounded up to the next second.

    Returns:
        The offset as zero-padded ``HH:MM:SS``. The hour field keeps growing
        past 23 for offsets of a day or more (e.g. 90000 -> "25:00:00")
        instead of wrapping back to zero. Negative offsets are clamped to
        "00:00:00".
    """
    # ``timedelta.seconds`` only holds the sub-day remainder, so it silently
    # discards whole days; ``total_seconds()`` covers the full duration.
    total = max(0, int(timedelta(seconds=seconds).total_seconds()))
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"
def _markdown_cell(text):
    """Return *text* made safe for use inside one Markdown table cell."""
    # An unescaped "|" is read as a column separator and a line break ends
    # the table row, so escape the former and flatten the latter.
    return text.replace("|", "\\|").replace("\r\n", " ").replace("\n", " ")


def transcribe_with_timestamps(audio_path, model="medium", language="fr"):
    """Transcribe an audio file with Whisper and export timestamped segments.

    The segments are written to four files in the current working directory:
    ``transcription.json``, ``transcription.txt``, ``transcription_table.md``
    and, when python-docx is installed, ``transcription.docx``. The Markdown
    and Word tables include an empty fourth column reserved for a Vietnamese
    translation.

    Args:
        audio_path: Path of the audio file handed to Whisper.
        model: Name of the Whisper model to load (default ``"medium"``).
        language: Language code passed to Whisper (default ``"fr"``).

    Returns:
        A list of dicts, one per segment, with the keys ``"start_time"`` and
        ``"end_time"`` (``HH:MM:SS`` strings) and ``"text"`` (stripped
        segment text).
    """
    # Keep the loaded model in its own name rather than rebinding the
    # ``model`` argument, which holds the model *name*.
    whisper_model = whisper.load_model(model)

    result = whisper_model.transcribe(
        audio_path,
        language=language,
        word_timestamps=True,
        # Do not feed the previous window's text back in as a prompt, so an
        # early mistake is not carried forward into later windows.
        condition_on_previous_text=False,
        # Threshold used by Whisper's silence (no-speech) detection.
        no_speech_threshold=0.5,
        # Average log-probability threshold used by Whisper when judging a
        # decoded segment.
        logprob_threshold=-1.0,
    )

    # Reduce each Whisper segment to the three fields that get exported.
    formatted_segments = [
        {
            "start_time": format_timestamp(segment["start"]),
            "end_time": format_timestamp(segment["end"]),
            "text": segment["text"].strip(),
        }
        for segment in result["segments"]
    ]

    # JSON export.
    with open("transcription.json", "w", encoding="utf-8") as f:
        json.dump(formatted_segments, f, ensure_ascii=False, indent=2)

    # Plain-text export (more readable).
    with open("transcription.txt", "w", encoding="utf-8") as f:
        for segment in formatted_segments:
            f.write(f'[{segment["start_time"]} -> {segment["end_time"]}]\n')
            f.write(f'{segment["text"]}\n\n')

    # Markdown table export with an empty translation column.
    with open("transcription_table.md", "w", encoding="utf-8") as f:
        f.write("| Thời gian bắt đầu | Thời gian kết thúc | Nội dung | Dịch tiếng Việt |\n")
        f.write("|-------------------|-------------------|----------|----------------|\n")
        for segment in formatted_segments:
            cell_text = _markdown_cell(segment["text"])
            f.write(f'| {segment["start_time"]} | {segment["end_time"]} | {cell_text} | |\n')

    # Word export with a 4-column table. python-docx is optional, so only the
    # import is guarded; errors while building or saving the document are not
    # misreported as a missing package.
    try:
        from docx import Document
    except ImportError:
        print("Để tạo file Word, hãy cài đặt python-docx: pip install python-docx")
    else:
        doc = Document()
        table = doc.add_table(rows=1, cols=4)
        table.style = 'Table Grid'

        # Header row.
        header_cells = table.rows[0].cells
        header_cells[0].text = 'Thời gian bắt đầu'
        header_cells[1].text = 'Thời gian kết thúc'
        header_cells[2].text = 'Nội dung'
        header_cells[3].text = 'Dịch tiếng Việt'

        # One row per segment.
        for segment in formatted_segments:
            row_cells = table.add_row().cells
            row_cells[0].text = segment["start_time"]
            row_cells[1].text = segment["end_time"]
            row_cells[2].text = segment["text"]
            row_cells[3].text = ""  # Translation column left blank

        doc.save('transcription.docx')
        print("Đã tạo file Word thành công!")

    return formatted_segments
# Example run: transcribe a sample recording when this file is executed.
audio_file = "demo.WAV"  # Point this at the audio file to transcribe
segments = transcribe_with_timestamps(audio_file)

# Echo each segment for a quick visual check: a "[start -> end]" header
# line, the segment text, then one blank separator line.
for segment in segments:
    print(
        f'[{segment["start_time"]} -> {segment["end_time"]}]',
        segment["text"],
        "",
        sep="\n",
    )