diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a5d3e18..cd6fedd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -59,14 +59,16 @@ Unreleased - Added Middle Korean (`okm`). (\#223) - Added Middle Irish (`mga`). (\#224) - Added Old Portuguese (`opt`). (\#225) +- Adds `tests/test_data` directory containing two tests. (\#226) - Adds Serbo-Croatian phoneme list and filtered TSV files. (\#227) - Added Tuvan (`tyv`). (\#228) - Added Shan (`shn`) with custom extraction. (\#229) ### Changed -- Specified UTF-8 encoding in handling text files. (\#221) - Renamed `.whitelist` file extension name as `.phones`. (\#207) +- Specified UTF-8 encoding in handling text files. (\#221) +- Moved previous contents of `tests` into `tests/test_wikipron`. (\#226) ### Deprecated ### Removed diff --git a/data/__init__.py b/data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data/src/__init__.py b/data/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data/src/generate_summary.py b/data/src/generate_summary.py index 5a62d4a6..ec836f85 100755 --- a/data/src/generate_summary.py +++ b/data/src/generate_summary.py @@ -7,7 +7,7 @@ from typing import Any, Dict, List -from codes import LANGUAGES_PATH, README_PATH, LANGUAGES_SUMMARY_PATH +from data.src.codes import LANGUAGES_PATH, README_PATH, LANGUAGES_SUMMARY_PATH def _wiki_name_and_transcription_level(ele: List[str]) -> str: diff --git a/data/src/scrape.py b/data/src/scrape.py index 5b15a0df..750ea9fa 100755 --- a/data/src/scrape.py +++ b/data/src/scrape.py @@ -15,7 +15,7 @@ import wikipron # type: ignore -from codes import LANGUAGES_PATH, LOGGING_PATH +from data.src.codes import LANGUAGES_PATH, LOGGING_PATH def _phones_reader(path: str) -> Iterator[str]: @@ -90,7 +90,7 @@ def _call_scrape( def _build_scraping_config( - config_settings: Dict[str, Any], wiki_name: str, dialect_suffix: str = "" + config_settings: Dict[str, Any], dialect_suffix: str = "" ) -> None: path_affix = 
f'../tsv/{config_settings["key"]}_{dialect_suffix}' phones_path_affix = f"../phones/{config_settings['key']}_{dialect_suffix}" @@ -187,7 +187,7 @@ def main(args: argparse.Namespace) -> None: } if "dialect" not in language_settings: _build_scraping_config( - config_settings, language_settings["wiktionary_name"] + config_settings ) else: for (dialect_key, dialect_value) in language_settings[ @@ -196,7 +196,6 @@ def main(args: argparse.Namespace) -> None: config_settings["dialect"] = dialect_value _build_scraping_config( config_settings, - language_settings["wiktionary_name"], dialect_key + "_", ) diff --git a/tests/test_data/__init__.py b/tests/test_data/__init__.py new file mode 100644 index 00000000..7731788e --- /dev/null +++ b/tests/test_data/__init__.py @@ -0,0 +1,33 @@ +import os +import shutil + +from contextlib import contextmanager + +_TESTS_DIR = os.path.dirname(os.getcwd()) +_TSV_PATH = f"{_TESTS_DIR}/tsv" +_PHONES_PATH = f"{_TESTS_DIR}/phones" + + +def write_dummy_phones_files(key: str, dialect: str) -> None: + """Creates dummy .phones files in dummy phones directory.""" + with open( + f"{_PHONES_PATH}/{key}_{dialect}phonetic.phones", "w", encoding="utf-8" + ) as f1: + f1.write("a") + with open( + f"{_PHONES_PATH}/{key}_{dialect}phonemic.phones", "w", encoding="utf-8" + ) as f2: + f2.write("a") + + +@contextmanager +def handle_dummy_files(phones: bool, key: str, dialect: str) -> str: + """Creates and removes dummy directories for housing + TSV and phones files.""" + os.mkdir(_TSV_PATH) + os.mkdir(_PHONES_PATH) + if phones: + write_dummy_phones_files(key, dialect) + yield _TSV_PATH + shutil.rmtree(_TSV_PATH) + shutil.rmtree(_PHONES_PATH) diff --git a/tests/test_data/test_scrape.py b/tests/test_data/test_scrape.py new file mode 100644 index 00000000..b940ca16 --- /dev/null +++ b/tests/test_data/test_scrape.py @@ -0,0 +1,62 @@ +import os + +from typing import List + +import pytest + +from data.src.scrape import _build_scraping_config + +from . 
import handle_dummy_files + + +# "mul" should be a future-proof iso639 code to test with. +# "mul" is resolved to "Multiple Languages" by iso639 package, +# which is a non-existent category on Wiktionary. +# An alternative solution to using "mul" would be to add +# a code to languagecodes.py explicitly for the purposes of testing. +@pytest.mark.parametrize( + "config_settings, dialect_suffix, phones, expected_file_name", + [ + # Dialect and phones + ( + {"key": "mul"}, + "test_", + True, + [ + "mul_test_phonetic.tsv", + "mul_test_phonemic.tsv", + "mul_test_phonetic_filtered.tsv", + "mul_test_phonemic_filtered.tsv", + ], + ), + # Dialect + ( + {"key": "mul"}, + "test_", + False, + ["mul_test_phonetic.tsv", "mul_test_phonemic.tsv"], + ), + # Standard + ({"key": "mul"}, "", False, ["mul_phonetic.tsv", "mul_phonemic.tsv"],), + ], +) +def test_file_creation( + config_settings: object, + dialect_suffix: str, + phones: bool, + expected_file_name: List[str], +): + """Check whether _build_scraping_config() outputs TSVs with expected + file names based on presence or absence of dialect specification + or .phones files for a given language. 
+ """ + with handle_dummy_files( + phones, config_settings["key"], dialect_suffix + ) as dummy_tsv_path: + _build_scraping_config( + config_settings=config_settings, dialect_suffix=dialect_suffix + ) + tsv_contents = os.listdir(dummy_tsv_path) + + for produced_file in tsv_contents: + assert produced_file in expected_file_name diff --git a/tests/test_data/test_summary.py b/tests/test_data/test_summary.py new file mode 100644 index 00000000..60eabd91 --- /dev/null +++ b/tests/test_data/test_summary.py @@ -0,0 +1,59 @@ +import os + +_REPO_DIR = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +) +_SUMMARY = os.path.join(_REPO_DIR, "data/languages_summary.tsv") +_TSV_DIRECTORY = os.path.join(_REPO_DIR, "data/tsv") + + +def test_summary_matches_language_data(): + """Check if each TSV referenced in data/languages_summary.tsv is + present in data/tsv. + + (Basically checks whether generate_summary.py has been run.) + """ + observed_name_to_count = {} + + for unique_tsv in os.listdir(_TSV_DIRECTORY): + with open( + f"{_TSV_DIRECTORY}/{unique_tsv}", "r", encoding="utf-8" + ) as tsv: + num_of_entries = sum(1 for line in tsv) + observed_name_to_count[unique_tsv] = num_of_entries + + with open(_SUMMARY, "r", encoding="utf-8") as lang_summary: + summary_files = [line.rstrip().split("\t")[0] for line in lang_summary] + + for summary_file in summary_files: + assert ( + summary_file in observed_name_to_count + ), f"{summary_file} in data/languages_summary.tsv but not in data/tsv" + + +def test_language_data_matches_summary(): + """Check if each TSV in data/tsv is present in data/languages_summary.tsv + and if the number of entries in each TSV matches its listed number + of entries in data/languages_summary.tsv. + + (Basically checks whether generate_summary.py has been run.) 
+ """ + name_count_dict = {} + with open(_SUMMARY, "r", encoding="utf-8") as lang_summary: + vals = [line.rstrip().split("\t") for line in lang_summary] + for val in vals: + name_count_dict[val[0]] = int(val[-1]) + + for unique_tsv in os.listdir(_TSV_DIRECTORY): + with open( + f"{_TSV_DIRECTORY}/{unique_tsv}", "r", encoding="utf-8" + ) as tsv: + num_of_entries = sum(1 for line in tsv) + assert unique_tsv in name_count_dict, ( + f"{unique_tsv} in data/tsv but not in " + "data/languages_summary.tsv" + ) + assert name_count_dict[unique_tsv] == num_of_entries, ( + f"Number of entries in {unique_tsv} does not match " + "number of entries in data/languages_summary.tsv." + ) diff --git a/tests/__init__.py b/tests/test_wikipron/__init__.py similarity index 100% rename from tests/__init__.py rename to tests/test_wikipron/__init__.py diff --git a/tests/test_cli.py b/tests/test_wikipron/test_cli.py similarity index 100% rename from tests/test_cli.py rename to tests/test_wikipron/test_cli.py diff --git a/tests/test_config.py b/tests/test_wikipron/test_config.py similarity index 100% rename from tests/test_config.py rename to tests/test_wikipron/test_config.py diff --git a/tests/test_extract.py b/tests/test_wikipron/test_extract.py similarity index 100% rename from tests/test_extract.py rename to tests/test_wikipron/test_extract.py diff --git a/tests/test_languagecodes.py b/tests/test_wikipron/test_languagecodes.py similarity index 100% rename from tests/test_languagecodes.py rename to tests/test_wikipron/test_languagecodes.py diff --git a/tests/test_scrape.py b/tests/test_wikipron/test_scrape.py similarity index 100% rename from tests/test_scrape.py rename to tests/test_wikipron/test_scrape.py diff --git a/tests/test_version.py b/tests/test_wikipron/test_version.py similarity index 79% rename from tests/test_version.py rename to tests/test_wikipron/test_version.py index fdd20319..804b56d0 100644 --- a/tests/test_version.py +++ b/tests/test_wikipron/test_version.py @@ -4,14 
+4,15 @@ import wikipron -_REPO_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +_REPO_DIR = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +) def test_version_number_match_with_changelog(): """__version__ and CHANGELOG.md match for the latest version number.""" changelog = open( - os.path.join(_REPO_DIR, "CHANGELOG.md"), - encoding="utf-8", + os.path.join(_REPO_DIR, "CHANGELOG.md"), encoding="utf-8", ).read() # latest version number in changelog = the 1st occurrence of '[x.y.z]' version_in_changelog = (