-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_index.py
24 lines (18 loc) · 816 Bytes
/
build_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import glob
import os

from whoosh.analysis import SimpleAnalyzer
from whoosh.fields import Schema, TEXT, STORED
from whoosh.index import create_in
# Reads the converted subtitle text files in ./subtitles and builds a Whoosh
# search index in ./index.
#
# Layout each input file is read with:
#   line 1: video title
#   line 2: video id
#   line 3: the word "dubbed" for dubbed videos (anything else means not dubbed)
#   line 4: skipped (presumably a blank separator line)
#   rest:   subtitle text, indexed as the searchable content
schema = Schema(
    content=TEXT(stored=True, analyzer=SimpleAnalyzer()),
    title=STORED,
    videoId=STORED,
    dubbed=STORED,
)

# create_in() expects the target directory to exist already.
os.makedirs("index", exist_ok=True)
index = create_in("index", schema)
writer = index.writer()

# Sorted so documents are added in the same order on every run; glob order
# is otherwise filesystem-dependent.
file_paths = sorted(glob.glob('./subtitles/*.txt'))
try:
    for file_path in file_paths:
        # Decode explicitly instead of relying on the platform locale.
        # NOTE(review): assumes the converter writes UTF-8 - confirm.
        with open(file_path, encoding="utf-8") as f:
            title = f.readline().strip()
            videoId = f.readline().strip()
            dubbed = f.readline().strip() == 'dubbed'
            f.readline()  # Skip the line separating the header from the text.
            content = f.read()
        writer.add_document(
            title=title, content=content, videoId=videoId, dubbed=dubbed
        )
except BaseException:
    # Abandon the partial write (releasing the writer's lock) before
    # propagating the error, including Ctrl-C.
    writer.cancel()
    raise
writer.commit(optimize=True)