Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add function to get EdifactFormat and Version of files, with tests #124

Merged
merged 12 commits into from
Jan 31, 2024
Merged
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dependencies = [
"aiohttp-requests>=0.2.2",
"pypdf>=3.4.1",
"maus>=0.4.1",
"pytz>=2022.7.1",
]
dynamic = ["readme", "version"]

Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ packaging==23.2
# via marshmallow
pypdf==4.0.1
# via edi_energy_scraper (pyproject.toml)
pytz==2023.3.post1
# via pandas
types-pytz==2023.3.1.0
# via pytz
soupsieve==2.5
# via beautifulsoup4
yarl==1.9.4
Expand Down
28 changes: 25 additions & 3 deletions src/edi_energy_scraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@
from email.message import Message
from pathlib import Path
from random import randint
from typing import Awaitable, Dict, Optional, Set, Union
from typing import Awaitable, Dict, List, Optional, Set, Tuple, Union

import aiohttp
import pytz
from aiohttp import ServerDisconnectedError
from aiohttp_requests import Requests # type:ignore[import]
from bs4 import BeautifulSoup, Comment # type:ignore[import]
from maus.edifact import EdifactFormat, EdifactFormatVersion, get_edifact_format_version
from pypdf import PdfReader

from edi_energy_scraper.epoch import Epoch
Expand Down Expand Up @@ -328,7 +330,7 @@ async def mirror(self):
with open(epoch_path, "w+", encoding="utf8") as outfile:
outfile.write(epoch_soup.prettify())
file_map = EdiEnergyScraper.get_epoch_file_map(epoch_soup)
download_tasks: list[Awaitable[Optional[Path]]] = []
download_tasks: List[Awaitable[Optional[Path]]] = []
file_counter = itertools.count()
for file_basename, link in file_map.items():
download_tasks.append(
Expand All @@ -339,9 +341,29 @@ async def mirror(self):
f"Successfully downloaded {_epoch} file {next(file_counter)}/{len(file_map)}",
)
)
download_results: list[Optional[Path]] = await asyncio.gather(*download_tasks)
download_results: List[Optional[Path]] = await asyncio.gather(*download_tasks)
for download_result in download_results:
if download_result is not None:
new_file_paths.add(download_result)
self.remove_no_longer_online_files(new_file_paths)
_logger.info("Finished mirroring")


def get_edifact_version_and_formats(path: Path) -> Tuple[EdifactFormatVersion, List[EdifactFormat]]:
    """
    Determine the EDIFACT format version and the EDIFACT formats of a given file.

    Both pieces of information are derived from the file name only; the file
    itself is never opened.

    A file can describe more than one format (for example APERAK and CONTRL).
    Therefore, a list of all formats whose name occurs in the file name is
    returned. The list follows the iteration order of ``EdifactFormat`` and is
    empty if no format name occurs in the file name.

    :param path: path of the file; its stem must end with ``_<YYYYMMDD>``
        (e.g. ``COMDISMIG1.0c_20240331_20231001.pdf``).
    :return: a tuple of the format version that ``get_edifact_format_version``
        returns for the trailing date and the list of matching formats.
    :raises ValueError: if the last ``_``-separated part of the stem is not a
        date in ``%Y%m%d`` notation.
    """
    filename = path.stem
    date_string = filename.split("_")[-1]  # Assuming date is in the last part of filename
    naive_key_date = datetime.datetime.strptime(date_string, "%Y%m%d")
    berlin = pytz.timezone("Europe/Berlin")
    # The date in the file name has no time zone. Calling ``astimezone`` on a
    # naive datetime would interpret it in the *system* time zone first, so a
    # host east of Berlin would shift the date to the previous day (and thereby
    # possibly to the previous format version). ``localize`` instead pins the
    # date to midnight in Berlin, independent of the machine running the code.
    berlin_local_time = berlin.localize(naive_key_date)
    version = get_edifact_format_version(berlin_local_time)
    list_of_edifactformats: List[EdifactFormat] = [entry for entry in EdifactFormat if str(entry) in filename]

    return version, list_of_edifactformats
83 changes: 82 additions & 1 deletion unittests/test_edienergyscraper.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from pathlib import Path
from typing import List, Optional, Tuple

import pytest
from aioresponses import aioresponses
from bs4 import BeautifulSoup
from maus.edifact import EdifactFormat, EdifactFormatVersion

from edi_energy_scraper import EdiEnergyScraper, Epoch
from edi_energy_scraper import EdiEnergyScraper, Epoch, get_edifact_version_and_formats


class TestEdiEnergyScraper:
Expand Down Expand Up @@ -434,3 +436,82 @@ async def test_mirroring(self, mocker, tmpdir_factory, datafiles, caplog):
}
remove_no_longer_online_files_mocker.assert_called_once_with(test_new_file_paths)
assert "Downloaded index.html" in caplog.messages

@pytest.mark.parametrize(
    "input_filename, expected_result",
    [
        # files that describe exactly one format
        (
            "APERAKMIG-informatorischeLesefassung2.1h_99991231_20221001.docx",
            (EdifactFormatVersion.FV2210, [EdifactFormat.APERAK]),
        ),
        ("COMDISMIG1.0c_20240331_20231001.pdf", (EdifactFormatVersion.FV2310, [EdifactFormat.COMDIS])),
        ("CONTRLMIG2.0b_99991231_20221001.pdf", (EdifactFormatVersion.FV2210, [EdifactFormat.CONTRL])),
        (
            "IFTSTAAHB-informatorischeLesefassung2.0e_99991231_20231001.docx",
            (EdifactFormatVersion.FV2310, [EdifactFormat.IFTSTA]),
        ),
        ("INSRPTAHB1.1g_99991231_20221001.pdf", (EdifactFormatVersion.FV2210, [EdifactFormat.INSRPT])),
        ("INVOICMIG2.8b_20240331_20231001.pdf", (EdifactFormatVersion.FV2310, [EdifactFormat.INVOIC])),
        ("MSCONSAHB3.1c_20240331_20231001.pdf", (EdifactFormatVersion.FV2310, [EdifactFormat.MSCONS])),
        ("ORDCHGMIG1.1_99991231_20231001.pdf", (EdifactFormatVersion.FV2310, [EdifactFormat.ORDCHG])),
        ("ORDERSMIG1.3_99991231_20231001.pdf", (EdifactFormatVersion.FV2310, [EdifactFormat.ORDERS])),
        ("ORDRSPMIG1.3_99991231_20231001.pdf", (EdifactFormatVersion.FV2310, [EdifactFormat.ORDRSP])),
        ("PRICATAHB2.0c_20240331_20231001.pdf", (EdifactFormatVersion.FV2310, [EdifactFormat.PRICAT])),
        ("QUOTESMIG1.3_99991231_20231001.pdf", (EdifactFormatVersion.FV2310, [EdifactFormat.QUOTES])),
        ("REMADVMIG2.9b_20240331_20231001.pdf", (EdifactFormatVersion.FV2310, [EdifactFormat.REMADV])),
        ("REQOTEMIG1.3_99991231_20231001.pdf", (EdifactFormatVersion.FV2310, [EdifactFormat.REQOTE])),
        ("UTILMDAHBGas1.0a_99991231_20231001.pdf", (EdifactFormatVersion.FV2310, [EdifactFormat.UTILMD])),
        (
            "UTILTSAHBBerechnungsformel1.0e_20240331_20231001.pdf",
            (EdifactFormatVersion.FV2310, [EdifactFormat.UTILTS]),
        ),
        # files that describe several formats at once
        (
            "APERAKCONTRLAHB2.3m_20240331_20231001.pdf",
            (EdifactFormatVersion.FV2310, [EdifactFormat.APERAK, EdifactFormat.CONTRL]),
        ),
        (
            "INVOICREMADVAHB2.5b_20240331_20231001.pdf",
            (EdifactFormatVersion.FV2310, [EdifactFormat.INVOIC, EdifactFormat.REMADV]),
        ),
        (
            "ORDERSORDRSPAHBMaBiS2.2c_99991231_20231001.pdf",
            (EdifactFormatVersion.FV2310, [EdifactFormat.ORDERS, EdifactFormat.ORDRSP]),
        ),
        (
            "REQOTEQUOTESORDERSORDRSPORDCHGAHB2.2_99991231_20231001.pdf",
            (
                EdifactFormatVersion.FV2310,
                [
                    EdifactFormat.ORDCHG,
                    EdifactFormat.ORDERS,
                    EdifactFormat.ORDRSP,
                    EdifactFormat.QUOTES,
                    EdifactFormat.REQOTE,
                ],
            ),
        ),
        # files whose name contains no format at all
        ("CodelistedereuropäischenLändercodes1.0_99991231_20171001.pdf", (EdifactFormatVersion.FV2104, [])),
        ("CodelistederZeitreihentypen1.1d_99991231_20211001.pdf", (EdifactFormatVersion.FV2110, [])),
        ("KostenblattFB1.0b_99991231_20230401.pdf", (EdifactFormatVersion.FV2304, [])),
        # same document name, different trailing dates -> different format versions
        ("PARTINMIG1.0c_20240331_20240403.pdf", (EdifactFormatVersion.FV2404, [EdifactFormat.PARTIN])),
        ("PARTINMIG1.0c_20240331_20241001.pdf", (EdifactFormatVersion.FV2410, [EdifactFormat.PARTIN])),
        ("PARTINMIG1.0c_20240331_20250401.pdf", (EdifactFormatVersion.FV2504, [EdifactFormat.PARTIN])),
        ("PARTINMIG1.0c_20240331_20251001.pdf", (EdifactFormatVersion.FV2510, [EdifactFormat.PARTIN])),
    ],
)
def test_get_edifact_version_and_formats(
    self, input_filename: str, expected_result: Tuple[EdifactFormatVersion, List[EdifactFormat]]
):
    """
    Checks that the format version and the list of formats derived from a
    file name match the expected (version, formats) tuple.
    """
    assert get_edifact_version_and_formats(Path(input_filename)) == expected_result