From 8c0378fb30f15b06be1e3118cb8f0b537c9e4149 Mon Sep 17 00:00:00 2001 From: zbruick Date: Thu, 27 Jun 2019 12:09:42 -0600 Subject: [PATCH 1/2] Adds a module to convert pandas dataframes to netCDF files. Intended for conversion to DSG netCDF format; if not enough parameters are set by the user, a non-CF-compliant netCDF file will still be written, with a warning issued. This should work for time series, profiles, or trajectories, with main testing done against METAR dataframes. --- metpy/io/pandas_to_netcdf.py | 172 ++++++++++++++++++++++++ metpy/io/tests/test_pandas_to_netcdf.py | 95 +++++++++++++ src/metpy/io/__init__.py | 15 ++- 3 files changed, 277 insertions(+), 5 deletions(-) create mode 100644 metpy/io/pandas_to_netcdf.py create mode 100644 metpy/io/tests/test_pandas_to_netcdf.py diff --git a/metpy/io/pandas_to_netcdf.py b/metpy/io/pandas_to_netcdf.py new file mode 100644 index 00000000000..d17a6a6b801 --- /dev/null +++ b/metpy/io/pandas_to_netcdf.py @@ -0,0 +1,172 @@ +# Copyright (c) 2019 MetPy Developers. +# Distributed under the terms of the BSD 3-Clause License. +# SPDX-License-Identifier: BSD-3-Clause +"""Support writing a pandas dataframe to a DSG netCDF file.""" + +import logging +from os import path + +from numpy import arange +import xarray as xr + +from ..package_tools import Exporter + +exporter = Exporter(globals()) + +log = logging.getLogger(__name__) + + +@exporter.export +def dataframe_to_netcdf(df, sampling_var, sampling_data_vars, path_to_save, netcdf_format=None, + column_units=None, standard_names=None, long_names=None, + dataset_type=None): + r"""Take a Pandas DataFrame and convert it to a netCDF file. + + If given a Pandas DataFrame, this function will first convert + it to an xarray Dataset, attach attributes and metadata to it as + provided by the user, and then save it as a CF-compliant discrete + sampling geometry (DSG) netCDF file. Assumes each row of the DataFrame + is a unique observation. + + This function is ideal for point data, such as station observations, + or for trajectory or profile data, which is discretely sampled at + individual points. + + Parameters + ---------- + df : `pandas.DataFrame` + Point data in pandas dataframe. + + sampling_var : str + Column name that is the sampling dimension: for surface observations, + this is the column that contains the station identifier/name. + + sampling_data_vars : list + List of all variables associated with the sampling variable that do not + vary with time, such as latitude, longitude, and elevation for + surface observations. + + path_to_save : str + Path, including filename, for where to save netCDF file. + + netcdf_format : str, optional + NetCDF format to write, passed through to `xarray.Dataset.to_netcdf` + (e.g. 'NETCDF4' or 'NETCDF3_CLASSIC'); if not given, xarray chooses + a default based on the available backends. + + column_units : dict, optional + Dictionary of units to attach to columns of the dataframe. Overrides + the units attribute if it is attached to the dataframe. + + standard_names : dict, optional + Dictionary of variable descriptions that are CF-compliant. + + long_names : dict, optional + Dictionary of longer variable descriptions that provide more detail + than standard_names. + + dataset_type : str, optional + Type of dataset to be converted. Options are 'timeSeries', 'profile', + or 'trajectory'. While optional, this variable should be declared to create + a CF-compliant DSG netCDF file. + + Returns + ------- + NetCDF file saved to `path_to_save`.
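+ + Examples + -------- + A minimal usage sketch, mirroring the accompanying tests (the DataFrame + contents and the output filename are illustrative only): + + >>> import pandas as pd + >>> df = pd.DataFrame({'temperature': [1, 2, 2, 3], + ... 'latitude': [4, 5, 6, 7], + ... 'longitude': [1, 2, 3, 4], + ... 'station_id': ['KFNL', 'KDEN', 'KVPZ', 'KORD']}) + >>> dataframe_to_netcdf(df, sampling_var='station_id', + ... sampling_data_vars=['station_id', 'latitude', 'longitude'], + ... path_to_save='stations.nc', dataset_type='timeSeries')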
+ + """ + # Verify_integrity must be true in order for conversion to netCDF to work + # Return a TypeError if not provided a Pandas DataFrame + try: + # Create the dimensions for use later in netCDF file + samplingindex = df.groupby([sampling_var], sort=False).ngroup() + obs = arange(0, len(df)) + df.insert(0, 'samplingIndex', samplingindex) + df.insert(1, 'observations', obs) + + # Handle the sampling location specific data + sampling_data = df[sampling_data_vars] + samples = sampling_data.groupby([sampling_var], sort=False).ngroup() + sampling_data.insert(0, 'samples', samples) + sampling_data = sampling_data.groupby('samples').first() + dataset_samples = xr.Dataset.from_dataframe(sampling_data) + + # Create the dataset for the variables of each observation + df = df.drop(sampling_data_vars, axis=1) + df = df.set_index(['observations'], verify_integrity=True) + dataset_var = xr.Dataset.from_dataframe(df) + + # Merge the two datasets together + dataset_final = xr.merge([dataset_samples, dataset_var], compat='no_conflicts') + + except (AttributeError, ValueError, TypeError): + raise TypeError('A pandas dataframe was not provided') + + # Attach variable-specific metadata + _assign_metadata(dataset_final, column_units, standard_names, long_names) + + # Attach dataset-specific metadata + if dataset_type: + dataset_final.attrs['featureType'] = dataset_type + else: + log.warning('No dataset type provided - netCDF will not have appropriate metadata' + 'for a DSG dataset.') + if dataset_type: + dataset_final[sampling_var].attrs['cf_role'] = dataset_type.lower() + '_id' + dataset_final['samplingIndex'].attrs['instance_dimension'] = 'samples' + + # Determine mode to write to netCDF + write_mode = 'w' + if path.exists(path_to_save): + # Eventually switch to 'a' to allow appending and delete error + raise ValueError('File already exists - please delete and run again') + + # Check if netCDF4 is installed to see how many unlimited dimensions we can use + # Need conditional import for checking due to Python 2 + try: + from importlib.util import find_spec + check_netcdf4 = find_spec('netCDF4') + except ImportError: + from imp import find_module + check_netcdf4 = find_module('netCDF4') + + # Make sure path is a string to allow netCDF4 to be used - needed for tests to pass + path_to_save = str(path_to_save) + + if check_netcdf4 is not None: + unlimited_dimensions = ['samples', 'observations'] + else: + # Due to xarray's fallback to scipy if netCDF4-python is not installed + # only one dimension can be unlimited. This may cause issues for users + log.warning('NetCDF4 not installed - saving as a netCDF3 file with only the' + 'observations dimension as unlimited. 
If netCDF4 or multiple' + 'dimensions are desired, run `pip install netCDF4`') + unlimited_dimensions = ['observations'] + + # Convert to netCDF + dataset_final.to_netcdf(path=path_to_save, mode=write_mode, format=netcdf_format, + unlimited_dims=unlimited_dimensions, compute=True) + + +def _assign_metadata(dataset, units_dict, standard_names_dict, long_names_dict): + if units_dict is not None: + final_column_units = {} + final_column_units['samples'] = '' + final_column_units['observations'] = '' + final_column_units['samplingIndex'] = '' + final_column_units.update(units_dict) + for var in dataset.variables: + dataset[var].attrs['units'] = final_column_units[var] + if standard_names_dict is not None: + final_std_names = {} + final_std_names['samples'] = '' + final_std_names['observations'] = '' + final_std_names['samplingIndex'] = '' + final_std_names.update(standard_names_dict) + for var in dataset.variables: + dataset[var].attrs['standard_name'] = final_std_names[var] + if long_names_dict is not None: + final_long_names = {} + final_long_names['samples'] = 'Sampling dimension' + final_long_names['observations'] = 'Observation dimension' + final_long_names['samplingIndex'] = 'Index of station for this observation' + final_long_names.update(long_names_dict) + for var in dataset.variables: + dataset[var].attrs['long_name'] = final_long_names[var] diff --git a/metpy/io/tests/test_pandas_to_netcdf.py b/metpy/io/tests/test_pandas_to_netcdf.py new file mode 100644 index 00000000000..6042079e997 --- /dev/null +++ b/metpy/io/tests/test_pandas_to_netcdf.py @@ -0,0 +1,95 @@ +# Copyright (c) 2019 MetPy Developers. +# Distributed under the terms of the BSD 3-Clause License. +# SPDX-License-Identifier: BSD-3-Clause +"""Test the `pandas_to_netcdf` module.""" + +import logging +import os + +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +from metpy.cbook import get_test_data +from metpy.io import dataframe_to_netcdf + +# Turn off the warnings for tests +logging.getLogger('metpy.io.pandas_to_netcdf').setLevel(logging.CRITICAL) + + +@pytest.fixture +def test_df(): + """Create generic dataframe for testing.""" + return pd.DataFrame({ + 'temperature': pd.Series([1, 2, 2, 3]), 'pressure': pd.Series([1, 2, 2, 3]), + 'latitude': pd.Series([4, 5, 6, 7]), 'longitude': pd.Series([1, 2, 3, 4]), + 'station_id': pd.Series(['KFNL', 'KDEN', 'KVPZ', 'KORD'])}) + + +def test_dataframe_to_netcdf_basic(tmpdir): + """Test dataframe conversion to netcdf.""" + df = pd.read_csv(get_test_data('station_data.txt'), usecols=[0, 1, 2, 3, 4, 5]) + df = df.rename(columns={'latitude[unit="degrees_north"]': 'latitude', + 'longitude[unit="degrees_east"]': 'longitude', + 'air_pressure_at_sea_level[unit="hectoPascal"]': + 'mean_sea_level_pressure', + 'air_temperature[unit="Celsius"]': 'temperature'}) + dataframe_to_netcdf(df, path_to_save=str(tmpdir) + '/test.nc', sampling_var='station', + sampling_data_vars=['station', 'latitude', 'longitude']) + assert os.path.exists(str(tmpdir) + '/test.nc') + data = xr.open_dataset(str(tmpdir) + '/test.nc') + assert np.max(data['temperature']) == 27 + + +def test_dataframe_to_netcdf_units(tmpdir): + """Test units attached via a dictionary.""" + df = pd.read_csv(get_test_data('station_data.txt'), usecols=[0, 1, 2, 3, 4, 5]) + df = df.rename(columns={'latitude[unit="degrees_north"]': 'latitude', + 'longitude[unit="degrees_east"]': 'longitude', + 'air_pressure_at_sea_level[unit="hectoPascal"]': + 'mean_sea_level_pressure', + 'air_temperature[unit="Celsius"]': 
'temperature'}) + col_units = {'samples': '', 'observations': '', 'samplingIndex': '', 'station': '', + 'latitude': 'degrees', 'longitude': 'degrees', 'temperature': 'degC', + 'mean_sea_level_pressure': 'hPa', 'time': ''} + dataframe_to_netcdf(df, path_to_save=str(tmpdir) + '/test.nc', sampling_var='station', + sampling_data_vars=['station', 'latitude', 'longitude'], + column_units=col_units, dataset_type='timeSeries') + data = xr.open_dataset(str(tmpdir) + '/test.nc') + assert data['station'].attrs['cf_role'] == 'timeseries_id' + assert data['temperature'].attrs['units'] == 'degC' + + +def test_dataframe_to_netcdf_names(test_df, tmpdir): + """Test attachment of standard names via a dictionary.""" + long_names = {'temperature': '2-meter air temperature', + 'pressure': 'Mean sea-level air pressure', 'latitude': 'Station latitude', + 'longitude': 'Station longitude', 'station_id': 'Station identifier'} + standard_names = {'temperature': 'air_temperature', + 'pressure': 'air_pressure_at_mean_sea_level', 'latitude': 'latitude', + 'longitude': 'longitude', 'station_id': 'platform_id'} + dataframe_to_netcdf(test_df, path_to_save=str(tmpdir) + '/test.nc', + sampling_var='station_id', + sampling_data_vars=['station_id', 'latitude', 'longitude'], + standard_names=standard_names, long_names=long_names) + data = xr.open_dataset(str(tmpdir) + '/test.nc') + assert data['temperature'].attrs['standard_name'] == 'air_temperature' + assert data['station_id'].attrs['long_name'] == 'Station identifier' + + +def test_no_dataframe(tmpdir): + """Test error message if Pandas DataFrame is not provided.""" + array = np.arange(0, 10) + with pytest.raises(TypeError, match='A pandas dataframe was not provided'): + dataframe_to_netcdf(array, path_to_save=str(tmpdir) + '/test.nc', sampling_var=None, + sampling_data_vars=None) + + +def test_file_exists(test_df, tmpdir): + """Test error message if netCDF file already exists.""" + open(str(tmpdir) + '/test.nc', 'wb') + with pytest.raises(ValueError, match='File already exists - please delete and run again'): + dataframe_to_netcdf(test_df, path_to_save=str(tmpdir) + '/test.nc', + sampling_var='station_id', + sampling_data_vars=['station_id', 'latitude', 'longitude']) diff --git a/src/metpy/io/__init__.py b/src/metpy/io/__init__.py index 5772e516c21..85e2c988462 100644 --- a/src/metpy/io/__init__.py +++ b/src/metpy/io/__init__.py @@ -1,15 +1,20 @@ -# Copyright (c) 2015,2016,2018 MetPy Developers. +# Copyright (c) 2015,2016,2018,2019 MetPy Developers. # Distributed under the terms of the BSD 3-Clause License. # SPDX-License-Identifier: BSD-3-Clause -"""Classes for reading various file formats. +"""Classes for reading and writing various file formats. -These classes are written to take both file names (for local files) or file-like objects; -this allows reading files that are already in memory (using :class:`python:io.StringIO`) -or remote files (using :func:`~python:urllib.request.urlopen`). +The gini and nexrad classes are written to take either file names (for local files) +or file-like objects; this allows reading files that are already in memory (using +:class:`python:io.StringIO`) or remote files (using :func:`~python:urllib.request.urlopen`). + +The `dataframe_to_netcdf` function takes a pandas dataframe and writes a netCDF file (in DSG +format if applicable).
""" from .gini import * # noqa: F403 from .nexrad import * # noqa: F403 +from .pandas_to_netcdf import * # noqa: F403 __all__ = gini.__all__[:] # pylint: disable=undefined-variable __all__.extend(nexrad.__all__) # pylint: disable=undefined-variable +__all__.extend(pandas_to_netcdf.__all__) # pylint: disable=undefined-variable From 4dad48f6821b939752facb17b7a44b7b95e51068 Mon Sep 17 00:00:00 2001 From: zbruick Date: Thu, 15 Aug 2019 14:53:28 -0600 Subject: [PATCH 2/2] Add appending capability to pandas_to_netcdf --- {metpy => src/metpy}/io/pandas_to_netcdf.py | 105 +++++++++++++----- .../io}/test_pandas_to_netcdf.py | 75 ++++++++++--- 2 files changed, 138 insertions(+), 42 deletions(-) rename {metpy => src/metpy}/io/pandas_to_netcdf.py (62%) rename {metpy/io/tests => tests/io}/test_pandas_to_netcdf.py (51%) diff --git a/metpy/io/pandas_to_netcdf.py b/src/metpy/io/pandas_to_netcdf.py similarity index 62% rename from metpy/io/pandas_to_netcdf.py rename to src/metpy/io/pandas_to_netcdf.py index d17a6a6b801..562e2f341cc 100644 --- a/metpy/io/pandas_to_netcdf.py +++ b/src/metpy/io/pandas_to_netcdf.py @@ -4,9 +4,10 @@ """Support reading a pandas dataframe to a DSG netCDF.""" import logging -from os import path +import os from numpy import arange +import pandas as pd import xarray as xr from ..package_tools import Exporter @@ -17,9 +18,9 @@ @exporter.export -def dataframe_to_netcdf(df, sampling_var, sampling_data_vars, path_to_save, netcdf_format=None, - column_units=None, standard_names=None, long_names=None, - dataset_type=None): +def dataframe_to_netcdf(df, mode, sampling_var, sampling_data_vars, path_to_save, + netcdf_format=None, column_units=None, standard_names=None, + long_names=None, dataset_type=None): r"""Take a Pandas DataFrame and convert it to a netCDF file. If given a Pandas DataFrame, this function will first convert @@ -37,6 +38,11 @@ def dataframe_to_netcdf(df, sampling_var, sampling_data_vars, path_to_save, netc df : `pandas.DataFrame` Point data in pandas dataframe. + mode : str + Specify whether to write ('w') a new netCDF file or append ('a') to an existing file. + If 'w' is specified and the `path_to_save` already exists, the file will be + overwritten. + sampling_var : str Column name that is the sampling dimension: for surface observations, this is the column that contains the station identifier/name @@ -71,6 +77,29 @@ def dataframe_to_netcdf(df, sampling_var, sampling_data_vars, path_to_save, netc ------- NetCDF file saved to `path_to_save`. + Notes + ----- + If append mode is used, all metadata will be preserved, but will be overwritten by + user input. + + """ + if mode == 'w': + _write_to_netcdf(df, sampling_var, sampling_data_vars, path_to_save, + netcdf_format, column_units, standard_names, long_names, + dataset_type) + elif mode == 'a': + _append_to_netcdf(df, sampling_var, sampling_data_vars, path_to_save, + netcdf_format, column_units, standard_names, long_names, + dataset_type) + else: + raise ValueError('Mode must either be "w" or "a".') + + +def _write_to_netcdf(df, sampling_var, sampling_data_vars, path_to_save, netcdf_format, + column_units, standard_names, long_names, dataset_type): + """Write Pandas DataFrame to netCDF file. + + This will overwrite any existing file at `path_to_save`. 
""" # Verify_integrity must be true in order for conversion to netCDF to work # Return a TypeError if not provided a Pandas DataFrame @@ -105,18 +134,15 @@ def dataframe_to_netcdf(df, sampling_var, sampling_data_vars, path_to_save, netc # Attach dataset-specific metadata if dataset_type: dataset_final.attrs['featureType'] = dataset_type + dataset_final[sampling_var].attrs['cf_role'] = dataset_type.lower() + '_id' else: log.warning('No dataset type provided - netCDF will not have appropriate metadata' 'for a DSG dataset.') - if dataset_type: - dataset_final[sampling_var].attrs['cf_role'] = dataset_type.lower() + '_id' dataset_final['samplingIndex'].attrs['instance_dimension'] = 'samples' - # Determine mode to write to netCDF - write_mode = 'w' - if path.exists(path_to_save): - # Eventually switch to 'a' to allow appending and delete error - raise ValueError('File already exists - please delete and run again') + # Remove any existing file + if os.path.exists(str(path_to_save)): + os.remove(str(path_to_save)) # Check if netCDF4 is installed to see how many unlimited dimensions we can use # Need conditional import for checking due to Python 2 @@ -127,9 +153,6 @@ def dataframe_to_netcdf(df, sampling_var, sampling_data_vars, path_to_save, netc from imp import find_module check_netcdf4 = find_module('netCDF4') - # Make sure path is a string to allow netCDF4 to be used - needed for tests to pass - path_to_save = str(path_to_save) - if check_netcdf4 is not None: unlimited_dimensions = ['samples', 'observations'] else: @@ -141,27 +164,52 @@ def dataframe_to_netcdf(df, sampling_var, sampling_data_vars, path_to_save, netc unlimited_dimensions = ['observations'] # Convert to netCDF - dataset_final.to_netcdf(path=path_to_save, mode=write_mode, format=netcdf_format, + dataset_final.to_netcdf(path=str(path_to_save), mode='w', format=netcdf_format, unlimited_dims=unlimited_dimensions, compute=True) +def _append_to_netcdf(df, sampling_var, sampling_data_vars, path_to_save, + netcdf_format, column_units, standard_names, long_names, dataset_type): + """Append to existing netCDF file.""" + ds = xr.open_dataset(str(path_to_save)) + df_old = (ds.to_dataframe().reset_index() + .drop(columns=['samplingIndex', 'observations', 'samples'])) + df_new = pd.concat([df_old, df], sort=False).reset_index(drop=True) # Pandas dependency + + # Assign metadata here + if dataset_type is None and 'featureType' in ds.attrs: + dataset_type = ds.attrs['featureType'] + append_column_units = {} + append_standard_names = {} + append_long_names = {} + for var_name, da in ds.data_vars.items(): + if 'units' in da.attrs: + append_column_units[var_name] = da.attrs['units'] + if 'standard_name' in da.attrs: + append_standard_names[var_name] = da.attrs['standard_name'] + if 'long_name' in da.attrs: + append_long_names[var_name] = da.attrs['long_name'] + if column_units is not None: + append_column_units.update(column_units) + if standard_names is not None: + append_standard_names.update(standard_names) + if long_names is not None: + append_long_names.update(long_names) + + _write_to_netcdf(df_new, sampling_var, sampling_data_vars, path_to_save, + netcdf_format, append_column_units, append_standard_names, + append_long_names, dataset_type) + + def _assign_metadata(dataset, units_dict, standard_names_dict, long_names_dict): if units_dict is not None: - final_column_units = {} - final_column_units['samples'] = '' - final_column_units['observations'] = '' - final_column_units['samplingIndex'] = '' - final_column_units.update(units_dict) for var 
in dataset.variables: - dataset[var].attrs['units'] = final_column_units[var] + if var in units_dict: + dataset[var].attrs['units'] = units_dict[var] if standard_names_dict is not None: - final_std_names = {} - final_std_names['samples'] = '' - final_std_names['observations'] = '' - final_std_names['samplingIndex'] = '' - final_std_names.update(standard_names_dict) for var in dataset.variables: - dataset[var].attrs['standard_name'] = final_std_names[var] + if var in standard_names_dict: + dataset[var].attrs['standard_name'] = standard_names_dict[var] if long_names_dict is not None: final_long_names = {} final_long_names['samples'] = 'Sampling dimension' @@ -169,4 +217,5 @@ def _assign_metadata(dataset, units_dict, standard_names_dict, long_names_dict): final_long_names['samplingIndex'] = 'Index of station for this observation' final_long_names.update(long_names_dict) for var in dataset.variables: - dataset[var].attrs['long_name'] = final_long_names[var] + if var in final_long_names: + dataset[var].attrs['long_name'] = final_long_names[var] diff --git a/metpy/io/tests/test_pandas_to_netcdf.py b/tests/io/test_pandas_to_netcdf.py similarity index 51% rename from metpy/io/tests/test_pandas_to_netcdf.py rename to tests/io/test_pandas_to_netcdf.py index 6042079e997..c0755531386 100644 --- a/metpy/io/tests/test_pandas_to_netcdf.py +++ b/tests/io/test_pandas_to_netcdf.py @@ -27,6 +27,15 @@ def test_df(): 'station_id': pd.Series(['KFNL', 'KDEN', 'KVPZ', 'KORD'])}) +@pytest.fixture +def test_df2(): + """Create generic dataframe for appending.""" + return pd.DataFrame({ + 'temperature': pd.Series([20]), 'pressure': pd.Series([1010]), + 'latitude': pd.Series([40]), 'longitude': pd.Series([-65]), + 'station_id': pd.Series(['KLGA'])}) + + def test_dataframe_to_netcdf_basic(tmpdir): """Test dataframe conversion to netcdf.""" df = pd.read_csv(get_test_data('station_data.txt'), usecols=[0, 1, 2, 3, 4, 5]) @@ -35,8 +44,9 @@ def test_dataframe_to_netcdf_basic(tmpdir): 'air_pressure_at_sea_level[unit="hectoPascal"]': 'mean_sea_level_pressure', 'air_temperature[unit="Celsius"]': 'temperature'}) - dataframe_to_netcdf(df, path_to_save=str(tmpdir) + '/test.nc', sampling_var='station', - sampling_data_vars=['station', 'latitude', 'longitude']) + dataframe_to_netcdf(df, mode='w', path_to_save=str(tmpdir) + '/test.nc', + sampling_var='station', sampling_data_vars=['station', 'latitude', + 'longitude']) assert os.path.exists(str(tmpdir) + '/test.nc') data = xr.open_dataset(str(tmpdir) + '/test.nc') assert np.max(data['temperature']) == 27 @@ -50,10 +60,10 @@ def test_dataframe_to_netcdf_units(tmpdir): 'air_pressure_at_sea_level[unit="hectoPascal"]': 'mean_sea_level_pressure', 'air_temperature[unit="Celsius"]': 'temperature'}) - col_units = {'samples': '', 'observations': '', 'samplingIndex': '', 'station': '', - 'latitude': 'degrees', 'longitude': 'degrees', 'temperature': 'degC', - 'mean_sea_level_pressure': 'hPa', 'time': ''} - dataframe_to_netcdf(df, path_to_save=str(tmpdir) + '/test.nc', sampling_var='station', + col_units = {'latitude': 'degrees', 'longitude': 'degrees', 'temperature': 'degC', + 'mean_sea_level_pressure': 'hPa'} + dataframe_to_netcdf(df, mode='w', path_to_save=str(tmpdir) + '/test.nc', + sampling_var='station', sampling_data_vars=['station', 'latitude', 'longitude'], column_units=col_units, dataset_type='timeSeries') data = xr.open_dataset(str(tmpdir) + '/test.nc') @@ -69,7 +79,7 @@ def test_dataframe_to_netcdf_names(test_df, tmpdir): standard_names = {'temperature': 'air_temperature', 
'pressure': 'air_pressure_at_mean_sea_level', 'latitude': 'latitude', 'longitude': 'longitude', 'station_id': 'platform_id'} - dataframe_to_netcdf(test_df, path_to_save=str(tmpdir) + '/test.nc', + dataframe_to_netcdf(test_df, mode='w', path_to_save=str(tmpdir) + '/test.nc', sampling_var='station_id', sampling_data_vars=['station_id', 'latitude', 'longitude'], standard_names=standard_names, long_names=long_names) @@ -82,14 +92,51 @@ def test_no_dataframe(tmpdir): """Test error message if Pandas DataFrame is not provided.""" array = np.arange(0, 10) with pytest.raises(TypeError, match='A pandas dataframe was not provided'): - dataframe_to_netcdf(array, path_to_save=str(tmpdir) + '/test.nc', sampling_var=None, - sampling_data_vars=None) + dataframe_to_netcdf(array, mode='w', path_to_save=str(tmpdir) + '/test.nc', + sampling_var=None, sampling_data_vars=None) -def test_file_exists(test_df, tmpdir): - """Test error message if netCDF file already exists.""" - open(str(tmpdir) + '/test.nc', 'wb') - with pytest.raises(ValueError, match='File already exists - please delete and run again'): - dataframe_to_netcdf(test_df, path_to_save=str(tmpdir) + '/test.nc', +def test_invalid_mode_option(test_df, tmpdir): + """Test error message if an incorrect file mode is specified.""" + with pytest.raises(ValueError, match='Mode must either be "w" or "a".'): + dataframe_to_netcdf(test_df, mode='r', path_to_save=str(tmpdir) + '/test.nc', sampling_var='station_id', sampling_data_vars=['station_id', 'latitude', 'longitude']) + + +def test_append_basic(test_df, test_df2, tmpdir): + """Test appending to an existing file.""" + dataframe_to_netcdf(test_df, mode='w', path_to_save=str(tmpdir) + '/test.nc', + sampling_var='station_id', + sampling_data_vars=['station_id', 'latitude', 'longitude']) + dataframe_to_netcdf(test_df2, mode='a', path_to_save=str(tmpdir) + '/test.nc', + sampling_var='station_id', + sampling_data_vars=['station_id', 'latitude', 'longitude']) + data = xr.open_dataset(str(tmpdir) + '/test.nc') + assert 'KLGA' in data['station_id'] + assert data.dims['samples'] == 5 + assert data.dims['observations'] == 17 + + +def test_append_attributes(test_df, test_df2, tmpdir): + """Test appending dataset with existing attributes.""" + units = {'temperature': 'degC', 'pressure': 'hPa', 'latitude': 'degrees', + 'longitude': 'degrees'} + long_names = {'temperature': '2-meter air temperature', + 'pressure': 'Mean sea-level air pressure', 'latitude': 'Station latitude', + 'longitude': 'Station longitude', 'station_id': 'Station identifier'} + standard_names = {'temperature': 'air_temperature', + 'pressure': 'air_pressure_at_mean_sea_level', 'latitude': 'latitude', + 'longitude': 'longitude', 'station_id': 'platform_id'} + dataframe_to_netcdf(test_df, mode='w', path_to_save=str(tmpdir) + '/test.nc', + sampling_var='station_id', + sampling_data_vars=['station_id', 'latitude', 'longitude'], + column_units=units, standard_names=standard_names, + long_names=long_names, dataset_type='timeSeries') + dataframe_to_netcdf(test_df2, mode='a', path_to_save=str(tmpdir) + '/test.nc', + sampling_var='station_id', + sampling_data_vars=['station_id', 'latitude', 'longitude']) + data = xr.open_dataset(str(tmpdir) + '/test.nc') + assert data.temperature.attrs['units'] == 'degC' + assert data.attrs['featureType'] == 'timeSeries' + assert data.station_id.attrs['cf_role'] == 'timeseries_id'
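For reference, a minimal end-to-end usage sketch of the API these two patches add. The calls mirror the test suite; the DataFrame contents and the output path are illustrative only, and a netCDF-capable xarray backend (netCDF4-python or scipy) is assumed to be installed:

    import pandas as pd
    from metpy.io import dataframe_to_netcdf

    # Four observations from four stations; each row is one observation
    df = pd.DataFrame({'temperature': [1, 2, 2, 3], 'pressure': [1, 2, 2, 3],
                       'latitude': [4, 5, 6, 7], 'longitude': [1, 2, 3, 4],
                       'station_id': ['KFNL', 'KDEN', 'KVPZ', 'KORD']})

    # mode='w' writes a new DSG file, overwriting anything at path_to_save
    dataframe_to_netcdf(df, mode='w', sampling_var='station_id',
                        sampling_data_vars=['station_id', 'latitude', 'longitude'],
                        path_to_save='stations.nc',
                        column_units={'temperature': 'degC', 'pressure': 'hPa',
                                      'latitude': 'degrees', 'longitude': 'degrees'},
                        dataset_type='timeSeries')

    # mode='a' appends to the existing file; metadata already stored in the
    # file is reused unless overridden by the keyword arguments
    new_obs = pd.DataFrame({'temperature': [20], 'pressure': [1010],
                            'latitude': [40], 'longitude': [-65],
                            'station_id': ['KLGA']})
    dataframe_to_netcdf(new_obs, mode='a', sampling_var='station_id',
                        sampling_data_vars=['station_id', 'latitude', 'longitude'],
                        path_to_save='stations.nc')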