diff --git a/src/virtualship/utils.py b/src/virtualship/utils.py index 980aa8ea..a3ad133d 100644 --- a/src/virtualship/utils.py +++ b/src/virtualship/utils.py @@ -1,3 +1,4 @@ +import os import warnings from datetime import timedelta from functools import lru_cache @@ -42,37 +43,33 @@ def _generic_load_yaml(data: str, model: BaseModel) -> BaseModel: return model.model_validate(yaml.safe_load(data)) -def mfp_to_yaml(excel_file_path: str, yaml_output_path: str): # noqa: D417 - """ - Generates a YAML file with spatial and temporal information based on instrument data from MFP excel file. +def load_coordinates(file_path): + """Loads coordinates from a file based on its extension.""" + if not os.path.isfile(file_path): + raise FileNotFoundError(f"File not found: {file_path}") - Parameters - ---------- - - excel_file_path (str): Path to the Excel file containing coordinate and instrument data. + ext = os.path.splitext(file_path)[-1].lower() - The function: - 1. Reads instrument and location data from the Excel file. - 2. Determines the maximum depth and buffer based on the instruments present. - 3. Ensures longitude and latitude values remain valid after applying buffer adjustments. - 4. returns the yaml information. + try: + if ext in [".xls", ".xlsx"]: + return pd.read_excel(file_path) - """ - # Importing Schedule and related models from expedition module - from virtualship.expedition.instrument_type import InstrumentType - from virtualship.expedition.schedule import Schedule - from virtualship.expedition.space_time_region import ( - SpaceTimeRegion, - SpatialRange, - TimeRange, - ) - from virtualship.expedition.waypoint import Location, Waypoint + if ext == ".csv": + return pd.read_csv(file_path) + + raise ValueError(f"Unsupported file extension {ext}.") + except Exception as e: + raise RuntimeError( + "Could not read coordinates data from the provided file. " + "Ensure it is either a csv or excel file." 
+ ) from e + + +def validate_coordinates(coordinates_data): # Expected column headers expected_columns = {"Station Type", "Name", "Latitude", "Longitude", "Instrument"} - # Read data from Excel - coordinates_data = pd.read_excel(excel_file_path) - # Check if the headers match the expected ones actual_columns = set(coordinates_data.columns) @@ -104,6 +101,51 @@ def mfp_to_yaml(excel_file_path: str, yaml_output_path: str): # noqa: D417 # Continue with the rest of the function after validation... coordinates_data = coordinates_data.dropna() + # Convert latitude and longitude to floats, replacing commas with dots + # Handles case when the latitude and longitude have decimals with commas + if coordinates_data["Latitude"].dtype in ["object", "string"]: + coordinates_data["Latitude"] = coordinates_data["Latitude"].apply( + lambda x: float(x.replace(",", ".")) + ) + + if coordinates_data["Longitude"].dtype in ["object", "string"]: + coordinates_data["Longitude"] = coordinates_data["Longitude"].apply( + lambda x: float(x.replace(",", ".")) + ) + + return coordinates_data + + +def mfp_to_yaml(coordinates_file_path: str, yaml_output_path: str): # noqa: D417 + """ + Generates a YAML file with spatial and temporal information based on instrument data from an MFP Excel or CSV file. + + Parameters + ---------- + - coordinates_file_path (str): Path to the Excel or CSV file containing coordinate and instrument data. + + The function: + 1. Reads instrument and location data from the file. + 2. Determines the maximum depth and buffer based on the instruments present. + 3. Ensures longitude and latitude values remain valid after applying buffer adjustments. + 4. returns the yaml information. 
+ + """ + # Importing Schedule and related models from expedition module + from virtualship.expedition.instrument_type import InstrumentType + from virtualship.expedition.schedule import Schedule + from virtualship.expedition.space_time_region import ( + SpaceTimeRegion, + SpatialRange, + TimeRange, + ) + from virtualship.expedition.waypoint import Location, Waypoint + + # Read data from file + coordinates_data = load_coordinates(coordinates_file_path) + + coordinates_data = validate_coordinates(coordinates_data) + # maximum depth (in meters), buffer (in degrees) for each instrument instrument_max_depths = { "XBT": 2000, diff --git a/tests/test_mfp_to_yaml.py b/tests/test_mfp_to_yaml.py index 10d9b93d..a3175185 100644 --- a/tests/test_mfp_to_yaml.py +++ b/tests/test_mfp_to_yaml.py @@ -1,3 +1,5 @@ +import os + import pandas as pd import pytest @@ -11,27 +13,71 @@ def valid_mfp_data(): { "Station Type": ["A", "B", "C"], "Name": ["Station1", "Station2", "Station3"], - "Latitude": [30, 31, 32], - "Longitude": [-44, -45, -46], + "Latitude": [30.8, 31.2, 32.5], + "Longitude": [-44.3, -45.1, -46.7], "Instrument": ["CTD, DRIFTER", "ARGO_FLOAT", "XBT, CTD, DRIFTER"], } ) +# Fixture for Excel file @pytest.fixture -def valid_mfp_file(tmp_path): +def valid_excel_mfp_file(tmp_path): path = tmp_path / "file.xlsx" valid_mfp_data().to_excel(path, index=False) - yield path + return path + + +# Fixture for CSV file +@pytest.fixture +def valid_csv_mfp_file(tmp_path): + path = tmp_path / "file.csv" + valid_mfp_data().to_csv(path, index=False) + return path + + +@pytest.fixture +def valid_csv_mfp_file_with_commas(tmp_path): + path = tmp_path / "file.csv" + valid_mfp_data().to_csv(path, decimal=",", index=False) + return path + + +@pytest.fixture +def invalid_mfp_file(tmp_path): + path = tmp_path / "file.csv" + valid_mfp_data().to_csv(path, decimal=",", sep="|", index=False) + + return path + + +@pytest.fixture +def unsupported_extension_mfp_file(tmp_path): + path = tmp_path / 
"file.unsupported" + valid_mfp_data().to_csv(path, index=False) + + return path + + +@pytest.fixture +def nonexistent_mfp_file(tmp_path): + path = tmp_path / "non_file.csv" + + return path + + +@pytest.fixture +def missing_instruments_column_mfp_file(tmp_path): + path = tmp_path / "file.xlsx" + valid_mfp_data().drop(columns=["Instrument"]).to_excel(path, index=False) + return path @pytest.fixture def missing_columns_mfp_file(tmp_path): path = tmp_path / "file.xlsx" - valid_mfp_data().drop(columns=["Longitude", "Instrument"]).to_excel( - path, index=False - ) - yield path + valid_mfp_data().drop(columns=["Longitude"]).to_excel(path, index=False) + return path @pytest.fixture @@ -43,8 +89,14 @@ def unexpected_header_mfp_file(tmp_path): yield path -def test_mfp_to_yaml_success(valid_mfp_file, tmp_path): - """Test that mfp_to_yaml correctly processes a valid MFP Excel file.""" +@pytest.mark.parametrize( + "fixture_name", + ["valid_excel_mfp_file", "valid_csv_mfp_file", "valid_csv_mfp_file_with_commas"], +) +def test_mfp_to_yaml_success(request, fixture_name, tmp_path): + """Test that mfp_to_yaml correctly processes a valid MFP file.""" + valid_mfp_file = request.getfixturevalue(fixture_name) + yaml_output_path = tmp_path / "schedule.yaml" # Run function (No need to mock open() for YAML, real file is created) @@ -66,15 +118,52 @@ def test_mfp_to_yaml_success(valid_mfp_file, tmp_path): ] -def test_mfp_to_yaml_missing_headers(missing_columns_mfp_file, tmp_path): - """Test that mfp_to_yaml raises an error when required columns are missing.""" +@pytest.mark.parametrize( + "fixture_name,error,match", + [ + pytest.param( + "nonexistent_mfp_file", + FileNotFoundError, + os.path.basename("/non_file.csv"), + id="FileNotFound", + ), + pytest.param( + "unsupported_extension_mfp_file", + RuntimeError, + "Could not read coordinates data from the provided file. 
Ensure it is either a csv or excel file.", + id="UnsupportedExtension", + ), + pytest.param( + "invalid_mfp_file", + RuntimeError, + "Could not read coordinates data from the provided file. Ensure it is either a csv or excel file.", + id="InvalidFile", + ), + pytest.param( + "missing_instruments_column_mfp_file", + ValueError, + "Error: Missing column 'Instrument'. Have you added this column after exporting from MFP?", + id="MissingInstruments", + ), + pytest.param( + "missing_columns_mfp_file", + ValueError, + ( + r"Error: Found columns \[.*?('Station Type'| 'Name'| 'Latitude'| 'Instrument').*?\], " + r"but expected columns \[.*?('Station Type'| 'Name'| 'Latitude'| 'Instrument'| 'Longitude').*?\]." + ), + id="MissingColumns", + ), + ], +) +def test_mfp_to_yaml_exceptions(request, fixture_name, error, match, tmp_path): + """Test that mfp_to_yaml raises an error when input file is not valid.""" + fixture = request.getfixturevalue(fixture_name) + yaml_output_path = tmp_path / "schedule.yaml" - with pytest.raises( - ValueError, - match="Error: Missing column 'Instrument'. Have you added this column after exporting from MFP?", - ): - mfp_to_yaml(missing_columns_mfp_file, yaml_output_path) + with pytest.raises(error, match=match): + mfp_to_yaml(fixture, yaml_output_path) def test_mfp_to_yaml_extra_headers(unexpected_header_mfp_file, tmp_path): @@ -85,12 +174,12 @@ def test_mfp_to_yaml_extra_headers(unexpected_header_mfp_file, tmp_path): mfp_to_yaml(unexpected_header_mfp_file, yaml_output_path) -def test_mfp_to_yaml_instrument_conversion(valid_mfp_file, tmp_path): +def test_mfp_to_yaml_instrument_conversion(valid_excel_mfp_file, tmp_path): """Test that instruments are correctly converted into InstrumentType enums.""" yaml_output_path = tmp_path / "schedule.yaml" # Run function - mfp_to_yaml(valid_mfp_file, yaml_output_path) + mfp_to_yaml(valid_excel_mfp_file, yaml_output_path) # Load the generated YAML data = Schedule.from_yaml(yaml_output_path)