Skip to content

Commit

Permalink
Handle TSVs with header rows
Browse files Browse the repository at this point in the history
Resolves #65 by checking whether the first time field of the first row can be parsed as a float. If it cannot, we assume that row is a header row and remove it from the list of lines. Otherwise the function behaves as before.

Squashed commit of DerMoehre's PR #73

Co-authored-by: JoFrhwld <JoFrhwld@gmail.com>
Co-authored-by: Christian Brickhouse <chrisbrickhouse@users.noreply.github.com>
  • Loading branch information
3 people authored Oct 6, 2022
1 parent 5e68946 commit ce4ee71
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 2 deletions.
8 changes: 7 additions & 1 deletion fave/align/transcriptprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,13 @@ def read_transcription_file(self):
"""Reads file into memory

Reads every line of ``self.file``, passes the list of lines through
``self.replace_smart_quotes`` and stores the result in ``self.lines``.
If the third tab-separated field of the first line cannot be parsed as
a float, that line is treated as a header row: a warning is logged and
the line is deleted from the list.
"""
with open(self.file) as f:
lines = self.replace_smart_quotes(f.readlines())
self.lines = lines
self.lines = lines
# Probe the third tab-separated field of the first line; a value that
# float() rejects is taken to mean the first line is a header row.
# NOTE(review): an empty file, or a first line with fewer than three
# tab-separated fields, raises IndexError here and only ValueError is
# caught below -- confirm whether that is intended.
try:
float(lines[0].split('\t')[2])
except ValueError:
# Log a warning about having detected a header row
self.logger.warning('Header row was detected')
# `lines` was assigned to self.lines above, so deleting from it here
# also drops the header from self.lines (same list object).
del lines[0]

# substitute any 'smart' quotes in the input file with the corresponding
# ASCII equivalents (otherwise they will be excluded as out-of-
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "fave"
version = "2.0.2-dev"
version = "2.0.3-dev"
description = "Forced alignment and vowel extraction"
authors = [
"Ingrid Rosenfelder",
Expand Down Expand Up @@ -35,3 +35,5 @@ build-backend = "poetry.masonry.api"
[tool.poetry.scripts]
fave-extract = "fave.extractFormants:main"
fave-align = "fave.FAAValign:setup"
extractFormants = "fave.extractFormants:main"
FAAValign = "fave.FAAValign:setup"
76 changes: 76 additions & 0 deletions tests/fave/align/test_transcriptprocessor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,27 @@
import logging
import pytest
from fave.align import transcriptprocessor
from fave import cmudictionary # We shouldn't be doing this...

# Copied from ../test_cmudictionary.py
# which means this really should be made a fixture...
# Keyword arguments forwarded to cmudictionary.CMU_Dictionary when
# test_read_transcription_file builds its dictionary object.
KWARGS = {
'verbose': 1
}

# Small set of dictionary entries, one per line (a word followed by what
# look like its phone symbols). test_read_transcription_file writes this
# text to a temporary "cmu.dict" file and loads that file with
# cmudictionary.CMU_Dictionary.
CMU_EXCERPT = """
TEST T EH1 S T
TEST'S T EH1 S T S
TESTA T EH1 S T AH0
TESTAMENT T EH1 S T AH0 M AH0 N T
TESTAMENTARY T EH2 S T AH0 M EH1 N T ER0 IY0
TESTED T EH1 S T AH0 D
TESTER T EH1 S T ER0
TESTERMAN T EH1 S T ER0 M AH0 N
TESTERS T EH1 S T ER0 Z
TESTERS T EH1 S T AH0 Z
"""


def test_replace_smart_quotes():
def test_func( testcase ):
Expand Down Expand Up @@ -69,3 +91,57 @@ def provide_check_transcription_format_raises_value_error():
# Skip 5 entries (not an error)
[ 'a\tb\tc\td\te\tf', ValueError], # 6 entries
]

def test_read_transcription_file(tmp_path):
    """Check what read_transcription_file leaves in ``lines`` for each case.

    Every case from provide_value_error_file supplies the transcript text
    to write to disk, the keyword flags for TranscriptProcessor, and the
    list that ``lines`` is expected to hold after reading the file.
    """
    workdir = tmp_path / "transcripts"
    workdir.mkdir()
    transcript_path = workdir / "test_transcript.csv"
    dictionary_path = workdir / "cmu.dict"
    dictionary_path.write_text(CMU_EXCERPT)
    cmu_dict = cmudictionary.CMU_Dictionary(dictionary_path, **KWARGS)

    # The same transcript path is rewritten for every case.
    for test_text, flags, expected in provide_value_error_file():
        transcript_path.write_text(test_text)
        processor = transcriptprocessor.TranscriptProcessor(
            transcript_path, cmu_dict, **flags
        )
        processor.read_transcription_file()

        assert processor.lines == expected

def provide_value_error_file():
    """Return the cases consumed by test_read_transcription_file.

    Each case is a three-item list ``[text, flags, expected]``:

    * ``text`` -- tab-separated transcript content written to disk,
    * ``flags`` -- keyword arguments passed to ``TranscriptProcessor``,
    * ``expected`` -- the value ``lines`` should hold after
      ``read_transcription_file`` has run.
    """
    def make_flags():
        # Build a fresh dict per case so that no case can influence
        # another by mutating its flags.
        return {
            'prompt': "IDK what this is -CJB",
            'check': '',
            'verbose': logging.DEBUG,
        }

    return [
        [  # header row is detected and deleted
            "Style\tSpeaker\tBeginning\tEnd\tDuration\nFoo\tBar\t0.0\t3.2\t3.2",
            make_flags(),
            ['Foo\tBar\t0.0\t3.2\t3.2'],
        ],
        [  # no header row, two data lines: nothing is removed
            "Foo\tBar\t0.0\t3.2\t3.2\nTest\t1.0\t4.5\t3.5",
            make_flags(),
            ['Foo\tBar\t0.0\t3.2\t3.2\n', 'Test\t1.0\t4.5\t3.5'],
        ],
        [  # no header row, three data lines: nothing is removed
            "Foo\tBar\t0.0\t3.2\t3.2\nTest\t1.0\t4.5\t3.5\nTest\t1.0\t4.5\t3.5",
            make_flags(),
            [
                'Foo\tBar\t0.0\t3.2\t3.2\n',
                'Test\t1.0\t4.5\t3.5\n',
                'Test\t1.0\t4.5\t3.5',
            ],
        ],
    ]

0 comments on commit ce4ee71

Please sign in to comment.