Skip to content

Commit

Permalink
Reorganizes tests and adds a few initial tests for the data side (#226)
Browse files Browse the repository at this point in the history
* moves wikipron module tests into subdirectory

* reformatting of test_version.py

* adds outline of test for data naming conventions, removes nonsense from src/scrape.py

* basic framework for testing file creation involved in big scrape

* renamed file naming test and added comments

* reorganizes tests directory, adds test for generate_summary.py

* fix formatting in test_version.py

* revises and renames file for testing scrape

* fixes pathing issue in init

* adds some typing to new tests

* changes open statements to use proper encoding

* potential solution to circleci module error

* approaching a circleci import solution?

* updates changelog
  • Loading branch information
lfashby authored Oct 13, 2020
1 parent ad6f2c9 commit 18256d9
Show file tree
Hide file tree
Showing 15 changed files with 165 additions and 9 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,16 @@ Unreleased
- Added Middle Korean (`okm`). (\#223)
- Added Middle Irish (`mga`). (\#224)
- Added Old Portuguese (`opt`). (\#225)
- Adds `tests/test_data` directory containing two tests. (\#226)
- Adds Serbo-Croatian phoneme list and filtered TSV files. (\#227)
- Added Tuvan (`tyv`). (\#228)
- Added Shan (`shn`) with custom extraction. (\#229)

### Changed

- Specified UTF-8 encoding in handling text files. (\#221)
- Renamed `.whitelist` file extension name as `.phones`. (\#207)
- Specified UTF-8 encoding in handling text files. (\#221)
- Moved previous contents of `tests` into `tests/test_wikipron`. (\#226)

### Deprecated
### Removed
Expand Down
Empty file added data/__init__.py
Empty file.
Empty file added data/src/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion data/src/generate_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from typing import Any, Dict, List

from codes import LANGUAGES_PATH, README_PATH, LANGUAGES_SUMMARY_PATH
from data.src.codes import LANGUAGES_PATH, README_PATH, LANGUAGES_SUMMARY_PATH


def _wiki_name_and_transcription_level(ele: List[str]) -> str:
Expand Down
7 changes: 3 additions & 4 deletions data/src/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import wikipron # type: ignore


from codes import LANGUAGES_PATH, LOGGING_PATH
from data.src.codes import LANGUAGES_PATH, LOGGING_PATH


def _phones_reader(path: str) -> Iterator[str]:
Expand Down Expand Up @@ -90,7 +90,7 @@ def _call_scrape(


def _build_scraping_config(
config_settings: Dict[str, Any], wiki_name: str, dialect_suffix: str = ""
config_settings: Dict[str, Any], dialect_suffix: str = ""
) -> None:
path_affix = f'../tsv/{config_settings["key"]}_{dialect_suffix}'
phones_path_affix = f"../phones/{config_settings['key']}_{dialect_suffix}"
Expand Down Expand Up @@ -187,7 +187,7 @@ def main(args: argparse.Namespace) -> None:
}
if "dialect" not in language_settings:
_build_scraping_config(
config_settings, language_settings["wiktionary_name"]
config_settings
)
else:
for (dialect_key, dialect_value) in language_settings[
Expand All @@ -196,7 +196,6 @@ def main(args: argparse.Namespace) -> None:
config_settings["dialect"] = dialect_value
_build_scraping_config(
config_settings,
language_settings["wiktionary_name"],
dialect_key + "_",
)

Expand Down
33 changes: 33 additions & 0 deletions tests/test_data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
import shutil

from contextlib import contextmanager
from typing import Iterator

# NOTE: despite its name, this is the parent of the current working
# directory, not the directory containing this file, so the dummy
# directories end up at "../tsv" and "../phones" relative to wherever
# the tests are invoked. data/src/scrape.py builds its paths with the
# same "../tsv/" and "../phones/" prefixes.
_TESTS_DIR = os.path.dirname(os.getcwd())
# Dummy directory for TSV files; created and removed by handle_dummy_files().
_TSV_PATH = f"{_TESTS_DIR}/tsv"
# Dummy directory for .phones files; created and removed alongside _TSV_PATH.
_PHONES_PATH = f"{_TESTS_DIR}/phones"


def write_dummy_phones_files(key: str, dialect: str) -> None:
    """Write one placeholder .phones file per transcription level.

    Two files are created inside the dummy phones directory, named
    ``{key}_{dialect}phonetic.phones`` and ``{key}_{dialect}phonemic.phones``.
    ``dialect`` is concatenated as-is, so it must already carry any
    separator it needs (it may also be the empty string). Each file
    contains the single character "a".
    """
    for level in ("phonetic", "phonemic"):
        phones_file = f"{_PHONES_PATH}/{key}_{dialect}{level}.phones"
        with open(phones_file, "w", encoding="utf-8") as sink:
            sink.write("a")


@contextmanager
def handle_dummy_files(phones: bool, key: str, dialect: str) -> Iterator[str]:
    """Create dummy TSV and phones directories, removing them on exit.

    Args:
        phones: If true, dummy .phones files for ``key``/``dialect`` are
            written into the dummy phones directory before yielding.
        key: Language key used to name the dummy .phones files.
        dialect: Dialect suffix used to name the dummy .phones files.

    Yields:
        The path of the dummy TSV directory.

    Raises:
        FileExistsError: If either directory already exists. A directory
            that was already there is never removed by this function.
    """
    # Each mkdir sits outside the try block that cleans it up, so only
    # directories created here are ever deleted.
    os.mkdir(_TSV_PATH)
    try:
        os.mkdir(_PHONES_PATH)
        try:
            if phones:
                write_dummy_phones_files(key, dialect)
            yield _TSV_PATH
        finally:
            # Runs even when the body of the with statement raises, so a
            # failing test cannot leave directories behind that would make
            # the next run fail on os.mkdir().
            shutil.rmtree(_PHONES_PATH)
    finally:
        shutil.rmtree(_TSV_PATH)
62 changes: 62 additions & 0 deletions tests/test_data/test_scrape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import os

from typing import List

import pytest

from data.src.scrape import _build_scraping_config

from . import handle_dummy_files


# "mul" should be a future-proof iso639 code to test with.
# "mul" is resolved to "Multiple Languages" by iso639 package,
# which is a non-existent category on Wiktionary.
# An alternative solution to using "mul" would be to add
# a code to languagecodes.py explicitly for the purposes of testing.
@pytest.mark.parametrize(
    "config_settings, dialect_suffix, phones, expected_file_name",
    [
        # Dialect and phones
        (
            {"key": "mul"},
            "test_",
            True,
            [
                "mul_test_phonetic.tsv",
                "mul_test_phonemic.tsv",
                "mul_test_phonetic_filtered.tsv",
                "mul_test_phonemic_filtered.tsv",
            ],
        ),
        # Dialect
        (
            {"key": "mul"},
            "test_",
            False,
            ["mul_test_phonetic.tsv", "mul_test_phonemic.tsv"],
        ),
        # Standard
        (
            {"key": "mul"},
            "",
            False,
            ["mul_phonetic.tsv", "mul_phonemic.tsv"],
        ),
    ],
)
def test_file_creation(
    config_settings: dict,
    dialect_suffix: str,
    phones: bool,
    expected_file_name: List[str],
) -> None:
    """Check the names of the TSVs written by _build_scraping_config().

    Every file found in the dummy TSV directory must be one of the names
    expected for the given combination of dialect suffix and presence or
    absence of .phones files.

    Args:
        config_settings: Settings passed to _build_scraping_config(); its
            "key" entry is also used to name the dummy .phones files.
        dialect_suffix: Dialect suffix passed to _build_scraping_config().
        phones: Whether dummy .phones files are created beforehand.
        expected_file_name: File names allowed in the dummy TSV directory.
    """
    with handle_dummy_files(
        phones, config_settings["key"], dialect_suffix
    ) as dummy_tsv_path:
        _build_scraping_config(
            config_settings=config_settings, dialect_suffix=dialect_suffix
        )
        # Take the listing before leaving the with block, which removes
        # the dummy directory.
        tsv_contents = os.listdir(dummy_tsv_path)

    # NOTE(review): this check is one-directional; a run that produces
    # no files at all still passes. Confirm whether every expected name
    # should also be required to exist.
    unexpected = sorted(set(tsv_contents) - set(expected_file_name))
    assert not unexpected, f"Unexpected TSV file(s) produced: {unexpected}"
59 changes: 59 additions & 0 deletions tests/test_data/test_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os

# Three directory levels above this file; for
# tests/test_data/test_summary.py that is the repository root.
_REPO_DIR = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
)
# Summary table whose rows the tests below compare against data/tsv.
_SUMMARY = os.path.join(_REPO_DIR, "data/languages_summary.tsv")
# Directory holding the TSV files.
_TSV_DIRECTORY = os.path.join(_REPO_DIR, "data/tsv")


def test_summary_matches_language_data() -> None:
    """Check if each TSV referenced in data/languages_summary.tsv is
    present in data/tsv.
    (Basically checks whether generate_summary.py has been run.)
    """
    # Only file names are compared here, so the TSVs are not opened;
    # entry counts are checked in test_language_data_matches_summary().
    present_tsvs = set(os.listdir(_TSV_DIRECTORY))

    # The first tab-separated field of each summary row is the TSV name.
    with open(_SUMMARY, "r", encoding="utf-8") as lang_summary:
        summary_files = [line.rstrip().split("\t")[0] for line in lang_summary]

    for summary_file in summary_files:
        assert (
            summary_file in present_tsvs
        ), f"{summary_file} in data/languages_summary.tsv but not in data/tsv"


def test_language_data_matches_summary():
    """Verify data/tsv against data/languages_summary.tsv.

    Every TSV in data/tsv must have a row in the summary, and the number
    of lines in the TSV must equal the count in the last column of that
    row. (A failure usually means generate_summary.py has not been rerun.)
    """
    # Map TSV name (first column) to listed entry count (last column).
    with open(_SUMMARY, "r", encoding="utf-8") as summary_handle:
        expected_counts = {
            fields[0]: int(fields[-1])
            for fields in (
                raw.rstrip().split("\t") for raw in summary_handle
            )
        }

    for tsv_name in os.listdir(_TSV_DIRECTORY):
        tsv_path = f"{_TSV_DIRECTORY}/{tsv_name}"
        with open(tsv_path, "r", encoding="utf-8") as tsv_handle:
            observed = sum(1 for _ in tsv_handle)
        assert tsv_name in expected_counts, (
            f"{tsv_name} in data/tsv but not in "
            "data/languages_summary.tsv"
        )
        assert expected_counts[tsv_name] == observed, (
            f"Number of entries in {tsv_name} does not match "
            "number of entries in data/languages_summary.tsv."
        )
)
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
7 changes: 4 additions & 3 deletions tests/test_version.py → tests/test_wikipron/test_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
import wikipron


_REPO_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
_REPO_DIR = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
)


def test_version_number_match_with_changelog():
"""__version__ and CHANGELOG.md match for the latest version number."""
changelog = open(
os.path.join(_REPO_DIR, "CHANGELOG.md"),
encoding="utf-8",
os.path.join(_REPO_DIR, "CHANGELOG.md"), encoding="utf-8",
).read()
# latest version number in changelog = the 1st occurrence of '[x.y.z]'
version_in_changelog = (
Expand Down

0 comments on commit 18256d9

Please sign in to comment.