-
Notifications
You must be signed in to change notification settings - Fork 71
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Reorganizes tests and adds a few initial tests for the data side (#226)
* moves wikipron module tests into subdirectory * reformatting of test_version.py * adds outline of test for data naming conventions, removes nonsense from src/scrape.py * basic framework for testing file creation involved in big scrape * renamed file naming test and added comments * reorganizes tests directory, adds test for generate_summary.py * fix formatting in test_version.py * revises and renames file for testing scrape * fixes pathing issue in init * adds some typing to new tests * changes open statements to use proper encoding * potential solution to circleci module error * approaching a circleci import solution? * updates changelog
- Loading branch information
Showing
15 changed files
with
165 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import os | ||
import shutil | ||
|
||
from contextlib import contextmanager | ||
|
||
# NOTE(review): these paths are derived from the current working directory,
# not from __file__, so they resolve correctly only when the test runner is
# started from the expected directory — confirm against the CI configuration.
_TESTS_DIR = os.path.dirname(os.getcwd())
# Dummy directories created/removed by handle_dummy_files below.
_TSV_PATH = f"{_TESTS_DIR}/tsv"
_PHONES_PATH = f"{_TESTS_DIR}/phones"
|
||
|
||
def write_dummy_phones_files(key: str, dialect: str) -> None:
    """Creates dummy .phones files in the dummy phones directory.

    One placeholder file is written for each transcription level
    (phonetic and phonemic), named ``{key}_{dialect}{level}.phones``.
    """
    for level in ("phonetic", "phonemic"):
        path = f"{_PHONES_PATH}/{key}_{dialect}{level}.phones"
        with open(path, "w", encoding="utf-8") as sink:
            sink.write("a")
|
||
|
||
@contextmanager
def handle_dummy_files(phones: bool, key: str, dialect: str) -> str:
    """Creates and removes dummy directories for housing
    TSV and phones files.

    Args:
        phones: whether to also populate the phones directory with
            dummy .phones files.
        key: language key used in the dummy file names.
        dialect: dialect suffix used in the dummy file names.

    Yields:
        The path to the dummy TSV directory.
    """
    os.mkdir(_TSV_PATH)
    os.mkdir(_PHONES_PATH)
    if phones:
        write_dummy_phones_files(key, dialect)
    try:
        yield _TSV_PATH
    finally:
        # Clean up even when the with-block body raises; without the
        # try/finally an exception (e.g. a failing assertion in a test)
        # would leave the directories behind and make every subsequent
        # use of this context manager fail on os.mkdir.
        shutil.rmtree(_TSV_PATH)
        shutil.rmtree(_PHONES_PATH)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import os | ||
|
||
from typing import List | ||
|
||
import pytest | ||
|
||
from data.src.scrape import _build_scraping_config | ||
|
||
from . import handle_dummy_files | ||
|
||
|
||
# "mul" should be a future-proof iso639 code to test with. | ||
# "mul" is resolved to "Multiple Languages" by iso639 package, | ||
# which is a non-existent category on Wikitionary. | ||
# An alternative solution to using "mul" would be to add | ||
# a code to languagecodes.py explicitly for the purposes of testing. | ||
# "mul" should be a future-proof iso639 code to test with.
# "mul" is resolved to "Multiple Languages" by the iso639 package,
# which is a non-existent category on Wiktionary.
# An alternative solution to using "mul" would be to add
# a code to languagecodes.py explicitly for the purposes of testing.
@pytest.mark.parametrize(
    "config_settings, dialect_suffix, phones, expected_file_name",
    [
        # Dialect and phones
        (
            {"key": "mul"},
            "test_",
            True,
            [
                "mul_test_phonetic.tsv",
                "mul_test_phonemic.tsv",
                "mul_test_phonetic_filtered.tsv",
                "mul_test_phonemic_filtered.tsv",
            ],
        ),
        # Dialect
        (
            {"key": "mul"},
            "test_",
            False,
            ["mul_test_phonetic.tsv", "mul_test_phonemic.tsv"],
        ),
        # Standard
        ({"key": "mul"}, "", False, ["mul_phonetic.tsv", "mul_phonemic.tsv"]),
    ],
)
def test_file_creation(
    config_settings: object,
    dialect_suffix: str,
    phones: bool,
    expected_file_name: List[str],
):
    """Check whether _build_scraping_config() outputs TSVs with expected
    file names based on presence or absence of dialect specification
    or .phones files for a given language.
    """
    with handle_dummy_files(
        phones, config_settings["key"], dialect_suffix
    ) as dummy_tsv_path:
        _build_scraping_config(
            config_settings=config_settings, dialect_suffix=dialect_suffix
        )
        tsv_contents = os.listdir(dummy_tsv_path)
    # Compare as sets so the test fails on both missing AND unexpected
    # files. The membership-only loop this replaces passed vacuously
    # when no files were produced at all.
    assert set(tsv_contents) == set(expected_file_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import os | ||
|
||
# Repository root: three directory levels up from this test module.
_REPO_DIR = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
)
# Summary TSV kept in sync by generate_summary.py (see test docstrings).
_SUMMARY = os.path.join(_REPO_DIR, "data/languages_summary.tsv")
# Directory holding the per-language scraped TSV files.
_TSV_DIRECTORY = os.path.join(_REPO_DIR, "data/tsv")
|
||
|
||
def test_summary_matches_language_data():
    """Check if each TSV referenced in data/languages_summary.tsv is
    present in data/tsv.
    (Basically checks whether generate_summary.py has been run.)
    """
    # Only file names matter here, so a set of directory entries is
    # enough; the original implementation also opened and line-counted
    # every TSV, but never used those counts (entry counts are verified
    # by test_language_data_matches_summary).
    observed_tsvs = set(os.listdir(_TSV_DIRECTORY))

    with open(_SUMMARY, "r", encoding="utf-8") as lang_summary:
        # First tab-separated column of each row is the TSV file name.
        summary_files = [line.rstrip().split("\t")[0] for line in lang_summary]

    for summary_file in summary_files:
        assert (
            summary_file in observed_tsvs
        ), f"{summary_file} in data/languages_summary.tsv but not in data/tsv"
|
||
|
||
def test_language_data_matches_summary():
    """Check if each TSV in data/tsv is present in data/languages_summary.tsv
    and if the number of entries in each TSV matches its listed number
    of entries in data/languages_summary.tsv.
    (Basically checks whether generate_summary.py has been run.)
    """
    # Map each summary row's file name (first column) to its recorded
    # entry count (last column).
    with open(_SUMMARY, "r", encoding="utf-8") as lang_summary:
        expected_counts = {
            row[0]: int(row[-1])
            for row in (line.rstrip().split("\t") for line in lang_summary)
        }

    for unique_tsv in os.listdir(_TSV_DIRECTORY):
        assert unique_tsv in expected_counts, (
            f"{unique_tsv} in data/tsv but not in "
            "data/languages_summary.tsv"
        )
        with open(
            f"{_TSV_DIRECTORY}/{unique_tsv}", "r", encoding="utf-8"
        ) as tsv:
            num_of_entries = sum(1 for _ in tsv)
        assert expected_counts[unique_tsv] == num_of_entries, (
            f"Number of entries in {unique_tsv} does not match "
            "number of entries in data/languages_summary.tsv."
        )
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters