Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reorganizes tests and adds a few initial tests for the data side #226

Merged
merged 16 commits into from
Oct 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,16 @@ Unreleased
- Added Middle Korean (`okm`). (\#223)
- Added Middle Irish (`mga`). (\#224)
- Added Old Portuguese (`opt`). (\#225)
- Added `tests/test_data` directory containing two tests. (\#226)
- Added Serbo-Croatian phoneme list and filtered TSV files. (\#227)
- Added Tuvan (`tyv`). (\#228)
- Added Shan (`shn`) with custom extraction. (\#229)

### Changed

- Specified UTF-8 encoding in handling text files. (\#221)
- Renamed `.whitelist` file extension name as `.phones`. (\#207)
- Specified UTF-8 encoding in handling text files. (\#221)
- Moved previous contents of `tests` into `tests/test_wikipron`. (\#226)

### Deprecated
### Removed
Expand Down
Empty file added data/__init__.py
Empty file.
Empty file added data/src/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion data/src/generate_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from typing import Any, Dict, List

from codes import LANGUAGES_PATH, README_PATH, LANGUAGES_SUMMARY_PATH
from data.src.codes import LANGUAGES_PATH, README_PATH, LANGUAGES_SUMMARY_PATH


def _wiki_name_and_transcription_level(ele: List[str]) -> str:
Expand Down
7 changes: 3 additions & 4 deletions data/src/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import wikipron # type: ignore


from codes import LANGUAGES_PATH, LOGGING_PATH
from data.src.codes import LANGUAGES_PATH, LOGGING_PATH


def _phones_reader(path: str) -> Iterator[str]:
Expand Down Expand Up @@ -90,7 +90,7 @@ def _call_scrape(


def _build_scraping_config(
config_settings: Dict[str, Any], wiki_name: str, dialect_suffix: str = ""
config_settings: Dict[str, Any], dialect_suffix: str = ""
) -> None:
path_affix = f'../tsv/{config_settings["key"]}_{dialect_suffix}'
phones_path_affix = f"../phones/{config_settings['key']}_{dialect_suffix}"
Expand Down Expand Up @@ -187,7 +187,7 @@ def main(args: argparse.Namespace) -> None:
}
if "dialect" not in language_settings:
_build_scraping_config(
config_settings, language_settings["wiktionary_name"]
config_settings
)
else:
for (dialect_key, dialect_value) in language_settings[
Expand All @@ -196,7 +196,6 @@ def main(args: argparse.Namespace) -> None:
config_settings["dialect"] = dialect_value
_build_scraping_config(
config_settings,
language_settings["wiktionary_name"],
dialect_key + "_",
)

Expand Down
33 changes: 33 additions & 0 deletions tests/test_data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this just a file moved from a different directory?

Is it common for this sort of thing to go in an __init__.py file? It seems like that's usually just reserved for aliasing and imports. One alternative would be to have something called like test_setup.py or something in this directory and have __init__.py pull it in.

import shutil

from contextlib import contextmanager
from typing import Iterator

# Dummy "tsv" and "phones" directories are placed in the parent of the
# current working directory, so these paths depend on where the tests are
# launched from (evaluated once, at import time).
_TESTS_DIR = os.path.dirname(os.getcwd())
_TSV_PATH = f"{_TESTS_DIR}/tsv"
_PHONES_PATH = f"{_TESTS_DIR}/phones"


def write_dummy_phones_files(key: str, dialect: str) -> None:
    """Writes placeholder phonetic and phonemic .phones files.

    Both files go into the dummy phones directory, are named
    ``{key}_{dialect}{level}.phones`` and contain the single phone "a".
    """
    for level in ("phonetic", "phonemic"):
        phones_file = f"{_PHONES_PATH}/{key}_{dialect}{level}.phones"
        with open(phones_file, "w", encoding="utf-8") as sink:
            sink.write("a")


@contextmanager
def handle_dummy_files(phones: bool, key: str, dialect: str) -> Iterator[str]:
    """Creates and removes dummy directories for housing
    TSV and phones files.

    Args:
        phones: Whether dummy .phones files should be written.
        key: Language key used in the dummy .phones file names.
        dialect: Dialect affix used in the dummy .phones file names.

    Yields:
        Path of the dummy TSV directory.

    Raises:
        FileExistsError: If either dummy directory already exists.
    """
    # Each directory is removed in a `finally` that is only entered once
    # its own mkdir has succeeded: a failure inside the `with` body no
    # longer leaks directories into later tests, and a pre-existing
    # directory that this call did not create is never deleted.
    os.mkdir(_TSV_PATH)
    try:
        os.mkdir(_PHONES_PATH)
        try:
            if phones:
                write_dummy_phones_files(key, dialect)
            yield _TSV_PATH
        finally:
            shutil.rmtree(_PHONES_PATH)
    finally:
        shutil.rmtree(_TSV_PATH)
kylebgorman marked this conversation as resolved.
Show resolved Hide resolved
62 changes: 62 additions & 0 deletions tests/test_data/test_scrape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import os

from typing import List

import pytest

from data.src.scrape import _build_scraping_config

from . import handle_dummy_files


# "mul" should be a future-proof iso639 code to test with.
# "mul" is resolved to "Multiple Languages" by iso639 package,
# which is a non-existent category on Wiktionary.
# An alternative solution to using "mul" would be to add
# a code to languagecodes.py explicitly for the purposes of testing.
@pytest.mark.parametrize(
    "config_settings, dialect_suffix, phones, expected_file_name",
    [
        # Dialect and phones
        (
            {"key": "mul"},
            "test_",
            True,
            [
                "mul_test_phonetic.tsv",
                "mul_test_phonemic.tsv",
                "mul_test_phonetic_filtered.tsv",
                "mul_test_phonemic_filtered.tsv",
            ],
        ),
        # Dialect
        (
            {"key": "mul"},
            "test_",
            False,
            ["mul_test_phonetic.tsv", "mul_test_phonemic.tsv"],
        ),
        # Standard
        ({"key": "mul"}, "", False, ["mul_phonetic.tsv", "mul_phonemic.tsv"]),
    ],
)
def test_file_creation(
    config_settings: dict,
    dialect_suffix: str,
    phones: bool,
    expected_file_name: List[str],
):
    """Check whether _build_scraping_config() outputs TSVs with expected
    file names based on presence or absence of dialect specification
    or .phones files for a given language.
    """
    with handle_dummy_files(
        phones, config_settings["key"], dialect_suffix
    ) as dummy_tsv_path:
        _build_scraping_config(
            config_settings=config_settings, dialect_suffix=dialect_suffix
        )
        # Must be listed before the context manager removes the directory.
        tsv_contents = os.listdir(dummy_tsv_path)

    # Compare as sets: a subset-only check would pass vacuously when no
    # file is produced and would miss an expected file that is absent.
    assert set(tsv_contents) == set(expected_file_name)
59 changes: 59 additions & 0 deletions tests/test_data/test_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os

# Three `dirname` steps up from this file's resolved path; the summary file
# and the TSV directory are then located under "data" relative to it.
_REPO_DIR = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
)
_SUMMARY = os.path.join(_REPO_DIR, "data/languages_summary.tsv")
_TSV_DIRECTORY = os.path.join(_REPO_DIR, "data/tsv")


def test_summary_matches_language_data():
    """Check if each TSV referenced in data/languages_summary.tsv is
    present in data/tsv.

    (Basically checks whether generate_summary.py has been run.)
    """
    # Only file names are compared here, so the directory is listed rather
    # than opening and line-counting every TSV.
    tsv_names = set(os.listdir(_TSV_DIRECTORY))

    # The TSV file name is the first tab-separated field of each line.
    with open(_SUMMARY, "r", encoding="utf-8") as lang_summary:
        summary_files = [line.rstrip().split("\t")[0] for line in lang_summary]

    for summary_file in summary_files:
        assert (
            summary_file in tsv_names
        ), f"{summary_file} in data/languages_summary.tsv but not in data/tsv"


def test_language_data_matches_summary():
    """Check that data/tsv and data/languages_summary.tsv agree.

    Every TSV in data/tsv must be listed in the summary, and its number
    of lines must equal the count given in the last summary column.

    (Basically checks whether generate_summary.py has been run.)
    """
    # Map each listed file name (first field) to its count (last field).
    with open(_SUMMARY, "r", encoding="utf-8") as summary_source:
        rows = [row.rstrip().split("\t") for row in summary_source]
    expected_counts = {fields[0]: int(fields[-1]) for fields in rows}

    for tsv_name in os.listdir(_TSV_DIRECTORY):
        tsv_path = f"{_TSV_DIRECTORY}/{tsv_name}"
        with open(tsv_path, "r", encoding="utf-8") as tsv_source:
            observed_count = sum(1 for _ in tsv_source)
        assert tsv_name in expected_counts, (
            f"{tsv_name} in data/tsv but not in "
            "data/languages_summary.tsv"
        )
        assert expected_counts[tsv_name] == observed_count, (
            f"Number of entries in {tsv_name} does not match "
            "number of entries in data/languages_summary.tsv."
        )
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
7 changes: 4 additions & 3 deletions tests/test_version.py → tests/test_wikipron/test_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
import wikipron


_REPO_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
_REPO_DIR = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
)


def test_version_number_match_with_changelog():
"""__version__ and CHANGELOG.md match for the latest version number."""
changelog = open(
os.path.join(_REPO_DIR, "CHANGELOG.md"),
encoding="utf-8",
os.path.join(_REPO_DIR, "CHANGELOG.md"), encoding="utf-8",
).read()
# latest version number in changelog = the 1st occurrence of '[x.y.z]'
version_in_changelog = (
Expand Down