-
Notifications
You must be signed in to change notification settings - Fork 8
/
textgridParser.py
111 lines (90 loc) · 3.83 KB
/
textgridParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# -*- coding: utf-8 -*-
import os
import textgrid as tgp
def textGrid2WordList(textgrid_file, whichTier = 'pinyin', utf16 = True):
    '''
    Read one tier of a TextGrid file into a plain python list.

    @param textgrid_file : path of the TextGrid file to read
    @param whichTier : name of the tier to extract, 'pinyin' by default;
                       tier names are compared after removing any '.'
    @param utf16 : when True the file is loaded with TextGrid.loadUTF16,
                   otherwise with TextGrid.load
    @return : list of [float(entry[0]), float(entry[1]), entry[2]] built from
              the simple transcript of every tier whose name matches
              (presumably [startTime, endTime, text]); the list is empty when
              no tier matches, and a message is printed in that case
    @raise Exception : when textgrid_file is not an existing file
    '''
    if not os.path.isfile(textgrid_file):
        raise Exception("file {} not found".format(textgrid_file))

    # pick the loader matching the requested encoding, then parse the file
    loader = tgp.TextGrid.loadUTF16 if utf16 else tgp.TextGrid.load
    grid = loader(textgrid_file)

    entries = []
    tierSeen = False
    for tier in tgp.TextGrid._find_tiers(grid):
        # dots are dropped from the tier name before comparing
        if tier.tier_name().replace('.', '') != whichTier:
            continue
        tierSeen = True
        # each transcript row is turned into [float, float, text]
        for row in tier.make_simple_transcript():
            entries.append([float(row[0]), float(row[1]), row[2]])

    if not tierSeen:
        print('Missing tier {1} in file {0}'.format(textgrid_file, whichTier))

    return entries
def line2WordList(line, entireWordList):
    '''
    Collect the entries of entireWordList that belong to one line.

    The word list is scanned in order. Collection starts at the first entry
    whose start time equals the start time of the line, and the scan stops
    right after the first entry whose end time equals the end time of the
    line; that closing entry is always included, even if no start was seen.
    If no entry closes the line, everything from the start entry to the end
    of the word list is returned.

    :param line: line tuple [startTime, endTime, string]
    :param entireWordList: entire word list, each entry laid out like line
    :return: list of the selected entries (same objects, original order)
    '''
    selected = []
    started = False
    for entry in entireWordList:
        if not (entry[1] == line[1]):
            # not the closing entry: open the window on a start-time match
            if not started and entry[0] == line[0]:
                started = True
            if started:
                selected.append(entry)
            continue
        # closing entry of the line: keep it and stop scanning
        selected.append(entry)
        break
    return selected
def wordListsParseByLines(entireLine, entireWordList):
    '''
    find the wordList for each line, cut the word list according to line

    Lines whose text is empty or made only of space characters are skipped.
    Every other line is paired with the entries that line2WordList selects
    for it. Words are counted whether or not their own text is empty.

    :param entireLine: entire lines in line tier, each [startTime, endTime, string]
    :param entireWordList: entire word lists in pinyin tier, same layout
    :return:
        nestedWordLists: [[line0, wordList0], [line1, wordList1], ...]
        numLines: number of non-empty lines
        numWords: total number of words over all non-empty lines
    '''
    nestedWordLists = []
    numWords = 0

    for line in entireLine:
        # Test the text itself. The former
        # line[2].encode("ascii", "replace").replace(" ", "") produced bytes
        # on Python 3, where bytes.replace(str, str) raises TypeError. The
        # encoding step only substituted characters one-for-one, so it never
        # affected whether the line is empty.
        if not line[2].replace(" ", ""):
            continue

        wordList = line2WordList(line, entireWordList)
        numWords += len(wordList)
        nestedWordLists.append([line, wordList])

    # one entry is stored per non-empty line, so the list length is numLines
    return nestedWordLists, len(nestedWordLists), numWords
def syllableTextgridExtraction(textgrid_path, recording, tier0, tier1):
    '''
    Read two tiers of one TextGrid file and nest the child tier under the
    parent tier.

    The file read is <textgrid_path>/<recording>.TextGrid.

    :param textgrid_path: directory containing the TextGrid file
    :param recording: file name without the '.TextGrid' extension
    :param tier0: parent tier
    :param tier1: child tier which should be covered by parent tier
    :return: the three values produced by wordListsParseByLines
        nestedPhonemeLists: element[0] - entry of the parent tier,
                            element[1] - list of child-tier entries for it
        numSyllables: number of parent-tier entries kept
        numPhonemes: number of child-tier entries kept
    '''
    textgrid_file = os.path.join(textgrid_path, recording + '.TextGrid')

    # both tiers come from the same file; parent first, then child
    parentEntries = textGrid2WordList(textgrid_file, whichTier=tier0)
    childEntries = textGrid2WordList(textgrid_file, whichTier=tier1)

    # cut the child tier according to the parent tier boundaries
    nested, parentCount, childCount = wordListsParseByLines(parentEntries, childEntries)
    return nested, parentCount, childCount