Skip to content

Commit

Permalink
Some cleanup and finishing tests
Browse files Browse the repository at this point in the history
  • Loading branch information
tobias-watzel committed Aug 31, 2023
1 parent 0deaa72 commit bb6778f
Show file tree
Hide file tree
Showing 9 changed files with 179 additions and 289 deletions.
260 changes: 99 additions & 161 deletions data_extractor/code/tests/test_train_on_pdf.py

Large diffs are not rendered by default.

17 changes: 3 additions & 14 deletions data_extractor/code/tests/test_utils/test_convert_xls_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def prerequisites_convert_xls_to_csv(path_folder_temporary: Path) -> None:
path_destination_annotation = path_folder_temporary / 'interim' / 'ml' / 'annotations'
path_source_annotation.mkdir(parents = True, exist_ok = True)
path_destination_annotation.mkdir(parents = True, exist_ok = True)
project_prefix = 'corporate_data_extraction_projects'
project_prefix = str(path_folder_temporary)

with (patch('train_on_pdf.source_annotation', str(path_source_annotation)),
patch('train_on_pdf.destination_annotation', str(path_destination_annotation)),
Expand All @@ -45,12 +45,9 @@ def test_convert_xls_to_csv_download_s3():
mocked_s3c_main.download_files_in_prefix_to_dir.side_effect = lambda *args: create_single_xlsx_file(Path(args[1]))
mocked_s3c_interim = Mock(spec = s3_communication.S3Communication)

# perform the convert_xls_to_csv call
convert_xls_to_csv(s3_usage, mocked_s3c_main, mocked_s3c_interim)

# assert that function has been called
mocked_s3c_main.download_files_in_prefix_to_dir.assert_called_once()
# assert that files exists in source_annotation folder
content_folder_source_annotation = list(Path(train_on_pdf.source_annotation).glob('*.xlsx'))
assert len(content_folder_source_annotation) == 1

Expand All @@ -66,10 +63,8 @@ def test_convert_xls_to_csv_upload_s3():
mocked_s3c_interim = Mock(spec = s3_communication.S3Communication)
mocked_s3c_interim.upload_files_in_dir_to_prefix.side_effect = lambda *args: create_multiple_xlsx_files(Path(args[1]))

# perform the convert_xls_to_csv call
convert_xls_to_csv(s3_usage, mocked_s3c_main, mocked_s3c_interim)

# assert upload function has been called
mocked_s3c_interim.upload_files_in_dir_to_prefix.assert_called_once()


Expand All @@ -84,10 +79,9 @@ def test_convert_xls_to_csv_value_error_multiple_xls():
mocked_s3c_main.download_files_in_prefix_to_dir.side_effect = lambda *args: create_multiple_xlsx_files(Path(args[1]))
mocked_s3c_interim = Mock(spec = s3_communication.S3Communication)

# perform the convert_xls_to_csv call and check for ValueError
with pytest.raises(ValueError, match = 'More than one excel sheet found'):
convert_xls_to_csv(s3_usage, mocked_s3c_main, mocked_s3c_interim)
# assert that function has been called

mocked_s3c_main.download_files_in_prefix_to_dir.assert_called_once()


Expand All @@ -102,10 +96,9 @@ def test_convert_xls_to_csv_value_error_no_annotation_xls():
mocked_s3c_main.download_files_in_prefix_to_dir.side_effect = lambda *args: None
mocked_s3c_interim = Mock(spec = s3_communication.S3Communication)

# perform the convert_xls_to_csv call and check for ValueError
with pytest.raises(ValueError, match = 'No annotation excel sheet found'):
convert_xls_to_csv(s3_usage, mocked_s3c_main, mocked_s3c_interim)
# assert that function has been called
mocked_s3c_main.download_files_in_prefix_to_dir.assert_called_once()


Expand All @@ -119,10 +112,8 @@ def test_convert_xls_to_csv_s3_usage():
mocked_s3c_interim = Mock(spec = s3_communication.S3Communication)
mocked_s3c_interim.upload_files_in_dir_to_prefix.side_effect = lambda *args: create_multiple_xlsx_files(Path(args[1]))

# perform the convert_xls_to_csv call
convert_xls_to_csv(s3_usage, mocked_s3c_main, mocked_s3c_interim)

# assert that s3_usage is True and upload_files_in_dir_to_prefix has been called
mocked_s3c_interim.upload_files_in_dir_to_prefix.assert_called_once()


Expand All @@ -134,9 +125,7 @@ def test_convert_xls_to_csv_no_s3_usage():
mocked_s3c_main = Mock(spec = s3_communication.S3Communication)
mocked_s3c_interim = Mock(spec = s3_communication.S3Communication)

# perform the convert_xls_to_csv call and check for ValueError
with pytest.raises(ValueError, match = 'No annotation excel sheet found'):
convert_xls_to_csv(s3_usage, mocked_s3c_main, mocked_s3c_interim)

# assert that upload_files_in_dir_to_prefix has not been called when S3 is not used
mocked_s3c_interim.upload_files_in_dir_to_prefix.assert_not_called()
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,9 @@ def test_copy_file_without_overwrite_result(path_folder_temporary: Path):
"""
path_folder_source = path_folder_temporary / 'source'
path_folder_destination = path_folder_temporary / 'destination'

# create test file in source folder
path_folder_source_file = path_folder_source / 'test.txt'
path_folder_source_file.touch()

# execute copy_file_without_overwrite
result = copy_file_without_overwrite(str(path_folder_source), str(path_folder_destination))
assert result == True

Expand All @@ -52,14 +49,11 @@ def test_copy_file_without_overwrite_file_not_exists(path_folder_temporary: Path
"""
path_folder_source = path_folder_temporary / 'source'
path_folder_destination = path_folder_temporary / 'destination'
# create test file in source folder
path_folder_source_file = path_folder_source / 'test.txt'
path_folder_source_file.touch()

# create test file path for destination folder
path_folder_destination_file = path_folder_destination / 'test.txt'
assert not path_folder_destination_file.exists()

# execute copy_file_without_overwrite
copy_file_without_overwrite(str(path_folder_source), str(path_folder_destination))
assert path_folder_destination_file.exists()
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ def test_create_directory(path_folder_temporary: Path):
:param path_folder_temporary: Requesting the path_folder_temporary fixture
:type path_folder_temporary: Path
"""
# call create_directory and check if the folder exists
create_directory(str(path_folder_temporary))

assert path_folder_temporary.exists()


Expand All @@ -20,12 +20,10 @@ def test_create_directory_cleanup(path_folder_temporary: Path):
:param path_folder_temporary: Requesting the path_folder_temporary fixture
:type path_folder_temporary: Path
"""
# create folder with files
path_folder_temporary.mkdir(exist_ok = True)
for i in range(10):
path_current_test_file = path_folder_temporary / f'test_{i}.txt'
path_current_test_file.touch()

# call create_directory and check for empty folder
create_directory(str(path_folder_temporary))
assert not any(path_folder_temporary.iterdir())
18 changes: 9 additions & 9 deletions data_extractor/code/tests/test_utils/test_generate_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ def test_generate_text_with_s3(path_folder_temporary: Path):
:param path_folder_temporary: Requesting the path_folder_temporary fixture
:type path_folder_temporary: Path
"""
# get the path to the temporary folder
path_folder_text_3434 = path_folder_temporary / 'folder_test_3434'
project_name = 'test'

Expand All @@ -87,6 +86,7 @@ def test_generate_text_with_s3(path_folder_temporary: Path):
}
}

<<<<<<< HEAD
# run the function to test
<<<<<<< HEAD
generate_text_3434(project_name)
Expand All @@ -101,10 +101,11 @@ def test_generate_text_with_s3(path_folder_temporary: Path):
generate_text_3434(project_name)
>>>>>>> ced44e3df (Feature/2023.04 os test (#14))
=======
=======
>>>>>>> bd5098e40 (Some cleanup and finishing tests)
with (patch('train_on_pdf.S3Communication', Mock(spec=s3_communication.S3Communication)) as mocked_s3):
generate_text_3434(project_name, True, mocked_s3_settings)

# check for calls
mocked_s3.assert_any_call(s3_endpoint_url='S3_END_MAIN',
aws_access_key_id='S3_ACCESS_MAIN',
aws_secret_access_key='S3_SECRET_MAIN',
Expand All @@ -128,7 +129,6 @@ def test_generate_text_no_s3(path_folder_temporary: Path):
:param path_folder_temporary: Requesting the path_folder_temporary fixture
:type path_folder_temporary: Path
"""
# get the path to the temporary folder
path_folder_text_3434 = path_folder_temporary / 'folder_test_3434'
project_name = 'test'
s3_usage = False
Expand All @@ -140,18 +140,21 @@ def test_generate_text_no_s3(path_folder_temporary: Path):
# ensure that the header and the content from the first file are written to
# the file text_3434.csv in folder relevance and that the content of the other
# files in folder relevance is appended without the header

# check if file_3434 exists
path_file_text_3434_csv = path_folder_text_3434 / 'text_3434.csv'
assert path_file_text_3434_csv.exists()

<<<<<<< HEAD
# check if header and content of files exist
<<<<<<< HEAD
strings_expected = [
f'That is a test {line_number}' for line_number in range(5)
]
=======
>>>>>>> ced44e3df (Feature/2023.04 os test (#14))
=======
strings_expected = [f'That is a test {line_number}'
for line_number in range(5)]
>>>>>>> bd5098e40 (Some cleanup and finishing tests)
with open(str(path_file_text_3434_csv), 'r') as file_text_3434:
for line_number, line_content in enumerate(file_text_3434, start = -1):
if line_number == -1:
Expand Down Expand Up @@ -201,18 +204,16 @@ def test_generate_text_not_successful_empty_folder(path_folder_temporary: Path,

:param path_folder_temporary: Requesting the path_folder_temporary fixture
:type path_folder_temporary: Path
:param capsys: Requesting default fixture for capturing cmd output
:param capsys: Requesting the default fixture capsys for capturing cmd outputs
:type capsys: typing.Generator[CaptureFixture[str], None, None]
"""
project_name = 'test'
s3_usage = False
project_settings = None

# clear the relevance folder
path_folder_relevance = path_folder_temporary / 'relevance'
[file.unlink() for file in path_folder_relevance.glob("*") if file.is_file()]

# call the function
return_value = generate_text_3434(project_name, s3_usage, project_settings)

output_cmd, _ = capsys.readouterr()
Expand All @@ -231,7 +232,6 @@ def test_generate_text_not_successful_exception(path_folder_temporary: Path):
s3_usage = False
project_settings = None

# clear the relevance folder
path_folder_relevance = path_folder_temporary / 'relevance'
[file.unlink() for file in path_folder_relevance.glob("*") if file.is_file()]

Expand Down
9 changes: 1 addition & 8 deletions data_extractor/code/tests/test_utils/test_link_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,12 @@ def test_link_files(path_folder_temporary: Path):
path_folder_source_pdf = path_folder_temporary / 'source_pdf'
path_folder_destination = path_folder_temporary / 'destination'

# create sample files
for i in range(10):
path_current_file = path_folder_source / f'test_{i}.txt'
path_current_file.touch()

# perform the linking
link_files(str(path_folder_source), str(path_folder_destination))

# check for hard links
for i in range(10):
path_current_file = path_folder_source / f'test_{i}.txt'
assert path_current_file.stat().st_nlink == 2
Expand All @@ -60,7 +57,7 @@ def test_link_extracted_files_result(path_folder_temporary: Path):
path_folder_source = path_folder_temporary / 'source'
path_folder_source_pdf = path_folder_temporary / 'source_pdf'
path_folder_destination = path_folder_temporary / 'destination'
# single pdf and json file

path_folder_source_file_pdf = path_folder_source / f'test.pdf'
path_folder_source_file_json = path_folder_source / f'test.json'
path_source_file_pdf = path_folder_source_pdf / f'test.pdf'
Expand All @@ -82,7 +79,6 @@ def test_link_extracted_files_copy(path_folder_temporary: Path):
path_folder_source_pdf = path_folder_temporary / 'source_pdf'
path_folder_destination = path_folder_temporary / 'destination'

# create test pdf and json files in the source_extraction and source_pdf folders
for i in range(10):
path_current_file = path_folder_source / f'test_{i}.pdf'
path_current_file.touch()
Expand All @@ -91,16 +87,13 @@ def test_link_extracted_files_copy(path_folder_temporary: Path):
path_current_file = path_folder_source_pdf / f'test_{i}.pdf'
path_current_file.touch()

# check if no files exist in the destination_extraction folder
for i in range(10):
path_current_file = path_folder_destination / f'test_{i}.json'
assert not path_current_file.exists() == True

# perform extracted file linking
link_extracted_files(str(path_folder_source), str(path_folder_source_pdf),
str(path_folder_destination))

# check if files exist in the destination_extraction folder
for i in range(10):
path_current_file = path_folder_destination / f'test_{i}.json'
assert path_current_file.exists() == True
Loading

0 comments on commit bb6778f

Please sign in to comment.