diff --git a/docs/sphinx/source/whatsnew/v0.9.1.rst b/docs/sphinx/source/whatsnew/v0.9.1.rst index 77b0956366..fe731bbd66 100644 --- a/docs/sphinx/source/whatsnew/v0.9.1.rst +++ b/docs/sphinx/source/whatsnew/v0.9.1.rst @@ -16,6 +16,7 @@ Deprecations Enhancements ~~~~~~~~~~~~ +* Added ``map_variables`` option to :func:`~pvlib.iotools.read_crn` (:pull:`1368`) * Added :py:func:`pvlib.temperature.prilliman` for modeling cell temperature at short time steps (:issue:`1081`, :pull:`1391`) @@ -29,6 +30,8 @@ Bug fixes argument was not being passed to the ``optimalinclination`` request parameter (:pull:`1356`) * Fixed bug in :py:func:`pvlib.bifacial.pvfactors_timeseries` where scalar ``surface_tilt`` and ``surface_azimuth`` inputs caused an error (:issue:`1127`, :issue:`1332`, :pull:`1361`) +* Added -99999 to list of values to map to nan in :func:`~pvlib.iotools.read_crn` + (:issue:`1372`, :pull:`1368`) * Changed the metadata entry for the wind speed unit to "Wind Speed Units" in the PSM3 iotools function (:pull:`1375`) * Improved convergence when determining the maximum power point using diff --git a/pvlib/data/CRN_with_problems.txt b/pvlib/data/CRN_with_problems.txt index f6bdfc54d6..25023dcaee 100644 Binary files a/pvlib/data/CRN_with_problems.txt and b/pvlib/data/CRN_with_problems.txt differ diff --git a/pvlib/iotools/crn.py b/pvlib/iotools/crn.py index c3b0e8776f..fe46bb69d2 100644 --- a/pvlib/iotools/crn.py +++ b/pvlib/iotools/crn.py @@ -2,15 +2,14 @@ """ import pandas as pd -import numpy as np -HEADERS = ( - 'WBANNO UTC_DATE UTC_TIME LST_DATE LST_TIME CRX_VN LONGITUDE LATITUDE ' - 'AIR_TEMPERATURE PRECIPITATION SOLAR_RADIATION SR_FLAG ' - 'SURFACE_TEMPERATURE ST_TYPE ST_FLAG RELATIVE_HUMIDITY RH_FLAG ' - 'SOIL_MOISTURE_5 SOIL_TEMPERATURE_5 WETNESS WET_FLAG WIND_1_5 WIND_FLAG' -) +HEADERS = [ + 'WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE', 'LST_TIME', 'CRX_VN', + 'LONGITUDE', 'LATITUDE', 'AIR_TEMPERATURE', 'PRECIPITATION', + 'SOLAR_RADIATION', 'SR_FLAG', 'SURFACE_TEMPERATURE', 'ST_TYPE', 'ST_FLAG', + 'RELATIVE_HUMIDITY', 'RH_FLAG', 'SOIL_MOISTURE_5', 'SOIL_TEMPERATURE_5', + 'WETNESS', 'WET_FLAG', 'WIND_1_5', 'WIND_FLAG'] VARIABLE_MAP = { 'LONGITUDE': 'longitude', @@ -24,6 +23,21 @@ 'WIND_FLAG': 'wind_speed_flag' } +NAN_DICT = { + 'CRX_VN': -99999, + 'AIR_TEMPERATURE': -9999, + 'PRECIPITATION': -9999, + 'SOLAR_RADIATION': -99999, + 'SURFACE_TEMPERATURE': -9999, + 'RELATIVE_HUMIDITY': -9999, + 'SOIL_MOISTURE_5': -99, + 'SOIL_TEMPERATURE_5': -9999, + 'WETNESS': -9999, + 'WIND_1_5': -99} + +# Add NUL characters to possible NaN values for all columns +NAN_DICT = {k: [v, '\x00\x00\x00\x00\x00\x00'] for k, v in NAN_DICT.items()} + # as specified in CRN README.txt file. excludes 1 space between columns WIDTHS = [5, 8, 4, 8, 4, 6, 7, 7, 7, 7, 6, 1, 7, 1, 1, 5, 1, 7, 7, 5, 1, 6, 1] # add 1 to make fields contiguous (required by pandas.read_fwf) @@ -40,15 +54,22 @@ ] -def read_crn(filename): - """ - Read a NOAA USCRN fixed-width file into pandas dataframe. The CRN is - described in [1]_ and [2]_. +def read_crn(filename, map_variables=True): + """Read a NOAA USCRN fixed-width file into a pandas dataframe. + + The CRN network consists of over 100 meteorological stations covering the + U.S. and is described in [1]_ and [2]_. The primary goal of CRN is to + provide long-term measurements of temperature, precipitation, and soil + moisture and temperature. Additionally, global horizontal irradiance (GHI) + is measured at each site using a photodiode pyranometer. Parameters ---------- filename: str, path object, or file-like filepath or url to read for the fixed-width file. + map_variables: boolean, default: True + When true, renames columns of the Dataframe to pvlib variable names + where applicable. See variable :const:`VARIABLE_MAP`. Returns ------- @@ -60,12 +81,12 @@ def read_crn(filename): ----- CRN files contain 5 minute averages labeled by the interval ending time. Here, missing data is flagged as NaN, rather than the lowest - possible integer for a field (e.g. -999 or -99). Air temperature in - deg C. Wind speed in m/s at a height of 1.5 m above ground level. + possible integer for a field (e.g. -999 or -99). Air temperature is in + deg C and wind speed is in m/s at a height of 1.5 m above ground level. - Variables corresponding to standard pvlib variables are renamed, + Variables corresponding to standard pvlib variables are by default renamed, e.g. `SOLAR_RADIATION` becomes `ghi`. See the - `pvlib.iotools.crn.VARIABLE_MAP` dict for the complete mapping. + :const:`pvlib.iotools.crn.VARIABLE_MAP` dict for the complete mapping. CRN files occasionally have a set of null characters on a line instead of valid data. This function drops those lines. Sometimes @@ -85,16 +106,13 @@ def read_crn(filename): Amer. Meteor. Soc., 94, 489-498. :doi:`10.1175/BAMS-D-12-00170.1` """ - # read in data. set fields with NUL characters to NaN - data = pd.read_fwf(filename, header=None, names=HEADERS.split(' '), - widths=WIDTHS, na_values=['\x00\x00\x00\x00\x00\x00']) - # at this point we only have NaNs from NUL characters, not -999 etc. - # these bad rows need to be removed so that dtypes can be set. - # NaNs require float dtype so we run into errors if we don't do this. - data = data.dropna(axis=0) - # loop here because dtype kwarg not supported in read_fwf until 0.20 - for (col, _dtype) in zip(data.columns, DTYPES): - data[col] = data[col].astype(_dtype) + # read in data + data = pd.read_fwf(filename, header=None, names=HEADERS, widths=WIDTHS, + na_values=NAN_DICT) + # Remove rows with all nans + data = data.dropna(axis=0, how='all') + # set dtypes here because dtype kwarg not supported in read_fwf until 0.20 + data = data.astype(dict(zip(HEADERS, DTYPES))) # set index # UTC_TIME does not have leading 0s, so must zfill(4) to comply @@ -103,19 +121,8 @@ def read_crn(filename): dtindex = pd.to_datetime(dts['UTC_DATE'] + dts['UTC_TIME'].str.zfill(4), format='%Y%m%d%H%M', utc=True) data = data.set_index(dtindex) - try: - # to_datetime(utc=True) does not work in older versions of pandas - data = data.tz_localize('UTC') - except TypeError: - pass - - # Now we can set nans. This could be done a per column basis to be - # safer, since in principle a real -99 value could occur in a -9999 - # column. Very unlikely to see that in the real world. - for val in [-99, -999, -9999]: - # consider replacing with .replace([-99, -999, -9999]) - data = data.where(data != val, np.nan) - - data = data.rename(columns=VARIABLE_MAP) + + if map_variables: + data = data.rename(columns=VARIABLE_MAP) return data diff --git a/pvlib/tests/iotools/test_crn.py b/pvlib/tests/iotools/test_crn.py index f974e8961f..b19888dda1 100644 --- a/pvlib/tests/iotools/test_crn.py +++ b/pvlib/tests/iotools/test_crn.py @@ -3,11 +3,11 @@ from numpy import dtype, nan import pytest from pvlib.iotools import crn -from ..conftest import DATA_DIR, assert_frame_equal +from ..conftest import DATA_DIR, assert_frame_equal, assert_index_equal @pytest.fixture -def columns(): +def columns_mapped(): return [ 'WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE', 'LST_TIME', 'CRX_VN', 'longitude', 'latitude', 'temp_air', 'PRECIPITATION', 'ghi', @@ -17,6 +17,16 @@ def columns(): 'WETNESS', 'WET_FLAG', 'wind_speed', 'wind_speed_flag'] +@pytest.fixture +def columns_unmapped(): + return [ + 'WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE', 'LST_TIME', 'CRX_VN', + 'LONGITUDE', 'LATITUDE', 'AIR_TEMPERATURE', 'PRECIPITATION', + 'SOLAR_RADIATION', 'SR_FLAG', 'SURFACE_TEMPERATURE', 'ST_TYPE', + 'ST_FLAG', 'RELATIVE_HUMIDITY', 'RH_FLAG', 'SOIL_MOISTURE_5', + 'SOIL_TEMPERATURE_5', 'WETNESS', 'WET_FLAG', 'WIND_1_5', 'WIND_FLAG'] + + @pytest.fixture def dtypes(): return [ @@ -39,7 +49,7 @@ def testfile_problems(): return DATA_DIR / 'CRN_with_problems.txt' -def test_read_crn(testfile, columns, dtypes): +def test_read_crn(testfile, columns_mapped, dtypes): index = pd.DatetimeIndex(['2019-01-01 16:10:00', '2019-01-01 16:15:00', '2019-01-01 16:20:00', @@ -54,25 +64,31 @@ def test_read_crn(testfile, columns, dtypes): 0.0, 340.0, 0, 4.3, 'C', 0, 83.0, 0, nan, nan, 1183, 0, 0.53, 0], [53131, 20190101, 1625, 20190101, 925, 3, -111.17, 32.24, 4.0, 0.0, 393.0, 0, 4.8, 'C', 0, 81.0, 0, nan, nan, 1223, 0, 0.64, 0]]) - expected = pd.DataFrame(values, columns=columns, index=index) + expected = pd.DataFrame(values, columns=columns_mapped, index=index) for (col, _dtype) in zip(expected.columns, dtypes): expected[col] = expected[col].astype(_dtype) out = crn.read_crn(testfile) assert_frame_equal(out, expected) -def test_read_crn_problems(testfile_problems, columns, dtypes): +# Test map_variables=False returns correct column names +def test_read_crn_map_variables(testfile, columns_unmapped, dtypes): + out = crn.read_crn(testfile, map_variables=False) + assert_index_equal(out.columns, pd.Index(columns_unmapped)) + + +def test_read_crn_problems(testfile_problems, columns_mapped, dtypes): # GH1025 index = pd.DatetimeIndex(['2020-07-06 12:00:00', '2020-07-06 13:10:00'], freq=None).tz_localize('UTC') values = np.array([ - [92821, 20200706, 1200, 20200706, 700, '3', -80.69, 28.62, 24.9, - 0.0, 190.0, 0, 25.5, 'C', 0, 93.0, 0, nan, nan, 990, 0, 1.57, 0], + [92821, 20200706, 1200, 20200706, 700, '3.0', -80.69, 28.62, 24.9, + 0.0, np.nan, 0, 25.5, 'C', 0, 93.0, 0, nan, nan, 990, 0, 1.57, 0], [92821, 20200706, 1310, 20200706, 810, '2.623', -80.69, 28.62, 26.9, 0.0, 430.0, 0, 30.2, 'C', 0, 87.0, 0, nan, nan, 989, 0, 1.64, 0]]) - expected = pd.DataFrame(values, columns=columns, index=index) + expected = pd.DataFrame(values, columns=columns_mapped, index=index) for (col, _dtype) in zip(expected.columns, dtypes): expected[col] = expected[col].astype(_dtype) out = crn.read_crn(testfile_problems)