-
Notifications
You must be signed in to change notification settings - Fork 71
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Reorganizes tests and adds a few initial tests for the data side (#226)
* moves wikipron module tests into subdirectory * reformatting of test_version.py * adds outline of test for data naming conventions, removes nonsense from src/scrape.py * basic framework for testing file creation involved in big scrape * renamed file naming test and added comments * reorganizes tests directory, adds test for generate_summary.py * fix formatting in test_version.py * revises and renames file for testing scrape * fixes pathing issue in init * adds some typing to new tests * changes open statements to use proper encoding * potential solution to circleci module error * approaching a circleci import solution? * updates changelog
- Loading branch information
Showing
15 changed files
with
165 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import os | ||
import shutil | ||
|
||
from contextlib import contextmanager | ||
|
||
# NOTE(review): these paths are derived from the current working directory,
# not from __file__, so they resolve correctly only when the test runner is
# started from the expected directory — confirm against the CI configuration.
_TESTS_DIR = os.path.dirname(os.getcwd())
# Dummy directories created/removed by handle_dummy_files below.
_TSV_PATH = f"{_TESTS_DIR}/tsv"
_PHONES_PATH = f"{_TESTS_DIR}/phones"
|
||
|
||
def write_dummy_phones_files(key: str, dialect: str) -> None:
    """Creates dummy .phones files in the dummy phones directory.

    One placeholder file is written for each transcription level
    (phonetic and phonemic), named ``{key}_{dialect}{level}.phones``.
    """
    for level in ("phonetic", "phonemic"):
        path = f"{_PHONES_PATH}/{key}_{dialect}{level}.phones"
        with open(path, "w", encoding="utf-8") as sink:
            sink.write("a")
|
||
|
||
@contextmanager
def handle_dummy_files(phones: bool, key: str, dialect: str) -> str:
    """Creates and removes dummy directories for housing
    TSV and phones files.

    Args:
        phones: whether to also populate the phones directory with
            dummy .phones files.
        key: language key used in the dummy file names.
        dialect: dialect suffix used in the dummy file names.

    Yields:
        The path to the dummy TSV directory.
    """
    os.mkdir(_TSV_PATH)
    os.mkdir(_PHONES_PATH)
    if phones:
        write_dummy_phones_files(key, dialect)
    try:
        yield _TSV_PATH
    finally:
        # Clean up even when the with-block body raises; without the
        # try/finally an exception (e.g. a failing assertion in a test)
        # would leave the directories behind and make every subsequent
        # use of this context manager fail on os.mkdir.
        shutil.rmtree(_TSV_PATH)
        shutil.rmtree(_PHONES_PATH)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import os | ||
|
||
from typing import List | ||
|
||
import pytest | ||
|
||
from data.src.scrape import _build_scraping_config | ||
|
||
from . import handle_dummy_files | ||
|
||
|
||
# "mul" should be a future-proof iso639 code to test with. | ||
# "mul" is resolved to "Multiple Languages" by iso639 package, | ||
# which is a non-existent category on Wikitionary. | ||
# An alternative solution to using "mul" would be to add | ||
# a code to languagecodes.py explicitly for the purposes of testing. | ||
# "mul" should be a future-proof iso639 code to test with.
# "mul" is resolved to "Multiple Languages" by the iso639 package,
# which is a non-existent category on Wiktionary.
# An alternative solution to using "mul" would be to add
# a code to languagecodes.py explicitly for the purposes of testing.
@pytest.mark.parametrize(
    "config_settings, dialect_suffix, phones, expected_file_name",
    [
        # Dialect and phones
        (
            {"key": "mul"},
            "test_",
            True,
            [
                "mul_test_phonetic.tsv",
                "mul_test_phonemic.tsv",
                "mul_test_phonetic_filtered.tsv",
                "mul_test_phonemic_filtered.tsv",
            ],
        ),
        # Dialect
        (
            {"key": "mul"},
            "test_",
            False,
            ["mul_test_phonetic.tsv", "mul_test_phonemic.tsv"],
        ),
        # Standard
        ({"key": "mul"}, "", False, ["mul_phonetic.tsv", "mul_phonemic.tsv"]),
    ],
)
def test_file_creation(
    config_settings: object,
    dialect_suffix: str,
    phones: bool,
    expected_file_name: List[str],
):
    """Check whether _build_scraping_config() outputs TSVs with expected
    file names based on presence or absence of dialect specification
    or .phones files for a given language.
    """
    with handle_dummy_files(
        phones, config_settings["key"], dialect_suffix
    ) as dummy_tsv_path:
        _build_scraping_config(
            config_settings=config_settings, dialect_suffix=dialect_suffix
        )
        tsv_contents = os.listdir(dummy_tsv_path)
    # Compare as sets so the test fails on both missing AND unexpected
    # files. The membership-only loop this replaces passed vacuously
    # when no files were produced at all.
    assert set(tsv_contents) == set(expected_file_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import os | ||
|
||
# Repository root: three directory levels up from this test module.
_REPO_DIR = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
)
# Summary TSV kept in sync by generate_summary.py (see test docstrings).
_SUMMARY = os.path.join(_REPO_DIR, "data/languages_summary.tsv")
# Directory holding the per-language scraped TSV files.
_TSV_DIRECTORY = os.path.join(_REPO_DIR, "data/tsv")
|
||
|
||
def test_summary_matches_language_data():
    """Check if each TSV referenced in data/languages_summary.tsv is
    present in data/tsv.
    (Basically checks whether generate_summary.py has been run.)
    """
    # Only file names matter here, so a set of directory entries is
    # enough; the original implementation also opened and line-counted
    # every TSV, but never used those counts (entry counts are verified
    # by test_language_data_matches_summary).
    observed_tsvs = set(os.listdir(_TSV_DIRECTORY))

    with open(_SUMMARY, "r", encoding="utf-8") as lang_summary:
        # First tab-separated column of each row is the TSV file name.
        summary_files = [line.rstrip().split("\t")[0] for line in lang_summary]

    for summary_file in summary_files:
        assert (
            summary_file in observed_tsvs
        ), f"{summary_file} in data/languages_summary.tsv but not in data/tsv"
|
||
|
||
def test_language_data_matches_summary():
    """Check if each TSV in data/tsv is present in data/languages_summary.tsv
    and if the number of entries in each TSV matches its listed number
    of entries in data/languages_summary.tsv.
    (Basically checks whether generate_summary.py has been run.)
    """
    # Map each summary row's file name (first column) to its recorded
    # entry count (last column).
    with open(_SUMMARY, "r", encoding="utf-8") as lang_summary:
        expected_counts = {
            row[0]: int(row[-1])
            for row in (line.rstrip().split("\t") for line in lang_summary)
        }

    for unique_tsv in os.listdir(_TSV_DIRECTORY):
        assert unique_tsv in expected_counts, (
            f"{unique_tsv} in data/tsv but not in "
            "data/languages_summary.tsv"
        )
        with open(
            f"{_TSV_DIRECTORY}/{unique_tsv}", "r", encoding="utf-8"
        ) as tsv:
            num_of_entries = sum(1 for _ in tsv)
        assert expected_counts[unique_tsv] == num_of_entries, (
            f"Number of entries in {unique_tsv} does not match "
            "number of entries in data/languages_summary.tsv."
        )
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters