diff --git a/monetio/sat/__init__.py b/monetio/sat/__init__.py
index adbe2029..5e9cd6ad 100644
--- a/monetio/sat/__init__.py
+++ b/monetio/sat/__init__.py
@@ -8,9 +8,11 @@
     _tropomi_l2_no2_mm,
     goes,
     modis_ornl,
-    nesdis_edr_viirs,
-    nesdis_eps_viirs,
+    nesdis_avhrr_aot_aws_gridded,
+    nesdis_eps_viirs_aod_nrt,
     nesdis_frp,
+    nesdis_viirs_aod_aws_gridded,
+    nesdis_viirs_ndvi_aws_gridded,
 )
 
 __all__ = [
@@ -23,9 +25,11 @@
     "_tropomi_l2_no2_mm",
     "goes",
     "modis_ornl",
-    "nesdis_edr_viirs",
-    "nesdis_eps_viirs",
+    "nesdis_avhrr_aot_aws_gridded",
+    "nesdis_eps_viirs_aod_nrt",
     "nesdis_frp",
+    "nesdis_viirs_aod_aws_gridded",
+    "nesdis_viirs_ndvi_aws_gridded",
 ]
 
 __name__ = "sat"
diff --git a/monetio/sat/nesdis_avhrr_aot_aws_gridded.py b/monetio/sat/nesdis_avhrr_aot_aws_gridded.py
new file mode 100644
index 00000000..f7a71eb3
--- /dev/null
+++ b/monetio/sat/nesdis_avhrr_aot_aws_gridded.py
@@ -0,0 +1,169 @@
+def create_daily_aod_list(date_generated, fs, warning=False):
+    """
+    Creates a list of daily AOD (Aerosol Optical Depth) files and calculates the total size of the files.
+
+    Parameters:
+        date_generated (list): A list of dates for which to check the existence of AOD files.
+        fs (FileSystem): The file system object used to check file existence and size.
+
+    Returns:
+        tuple: A tuple containing the list of file paths and the total size of the files.
+    """
+    import warnings
+
+    # Loop through observation dates & check for files
+    nodd_file_list = []
+    nodd_total_size = 0
+    for date in date_generated:
+        file_date = date.strftime("%Y%m%d")
+        year = file_date[:4]
+        prod_path = "noaa-cdr-aerosol-optical-thickness-pds/data/daily/" + year + "/"
+        patt = "AOT_AVHRR_*_daily-avg_"
+        file_names = fs.glob(prod_path + patt + file_date + "_*.nc")
+        # If files exist, add their paths to the list and their sizes to the total
+        if file_names:
+            nodd_file_list.extend(file_names)
+            nodd_total_size = nodd_total_size + sum(fs.size(f) for f in file_names)
+        else:
+            msg = "File does not exist on AWS: " + prod_path + patt + file_date + "_*.nc"
+            if warning:
+                warnings.warn(msg)
+                nodd_file_list.append(None)
+            else:
+                raise ValueError(msg)
+
+    return nodd_file_list, nodd_total_size
+
+
+def create_monthly_aod_list(date_generated, fs, warning=False):
+    """
+    Creates a list of monthly AOD (Aerosol Optical Depth) files and calculates the total size of the files.
+
+    Parameters:
+        date_generated (list): A list of dates for which to check the existence of AOD files.
+        fs (FileSystem): The file system object used to check file existence and size.
+
+    Returns:
+        tuple: A tuple containing the list of file paths and the total size of the files.
+    """
+    import warnings
+
+    # Loop through observation dates & check for files
+    nodd_file_list = []
+    nodd_total_size = 0
+    for date in date_generated:
+        file_date = date.strftime("%Y%m")
+        year = file_date[:4]
+        prod_path = "noaa-cdr-aerosol-optical-thickness-pds/data/monthly/" + year + "/"
+        # Monthly files are assumed to mirror the daily naming scheme,
+        # with a "monthly-avg" tag and a YYYYMM date stamp
+        patt = "AOT_AVHRR_*_monthly-avg_"
+        file_names = fs.glob(prod_path + patt + file_date + "_*.nc")
+        # If files exist, add their paths to the list and their sizes to the total
+        if file_names:
+            nodd_file_list.extend(file_names)
+            nodd_total_size = nodd_total_size + sum(fs.size(f) for f in file_names)
+        else:
+            msg = "File does not exist on AWS: " + prod_path + patt + file_date + "_*.nc"
+            if warning:
+                warnings.warn(msg)
+                nodd_file_list.append(None)
+            else:
+                raise ValueError(msg)
+
+    return nodd_file_list, nodd_total_size
+
+
+def open_dataset(date, averaging_time="daily"):
+    """
+    Opens a dataset for the given date and averaging time.
+
+    Parameters:
+        date (str or datetime.datetime): The date for which to open the dataset.
+        averaging_time (str, optional): The averaging time.
+            Valid values are 'daily' or 'monthly'. Defaults to 'daily'.
+
+    Returns:
+        xarray.Dataset: The opened dataset.
+
+    Raises:
+        ValueError: If the input values are invalid.
+    """
+    import pandas as pd
+    import s3fs
+    import xarray as xr
+
+    if isinstance(date, str):
+        date_generated = [pd.Timestamp(date)]
+    else:
+        date_generated = [date]
+
+    # Access AWS using anonymous credentials
+    fs = s3fs.S3FileSystem(anon=True)
+
+    if averaging_time.lower() == "monthly":
+        file_list, _ = create_monthly_aod_list(date_generated, fs)
+    elif averaging_time.lower() == "daily":
+        file_list, _ = create_daily_aod_list(date_generated, fs)
+    else:
+        raise ValueError(
+            f"Invalid input for 'averaging_time' {averaging_time!r}: "
+            "Valid values are 'daily' or 'monthly'"
+        )
+
+    if len(file_list) == 0 or all(f is None for f in file_list):
+        raise ValueError(f"Files not available for product and date: {date_generated[0]}")
+
+    aws_file = fs.open(file_list[0])
+
+    dset = xr.open_dataset(aws_file)
+
+    return dset
+
+
+def open_mfdataset(dates, averaging_time="daily", error_missing=False):
+    """
+    Opens and combines multiple NetCDF files into a single xarray dataset.
+
+    Parameters:
+        dates (pandas.DatetimeIndex): The dates for which to retrieve the data.
+        averaging_time (str, optional): The averaging time.
+            Valid values are 'daily' or 'monthly'. Defaults to 'daily'.
+        error_missing (bool, optional): If False (default), skip missing files with warning
+            and continue processing. Otherwise, raise an error.
+
+    Returns:
+        xarray.Dataset: The combined dataset containing the data for the specified dates.
+
+    Raises:
+        ValueError: If the input parameters are invalid.
+    """
+    from collections.abc import Iterable
+
+    import pandas as pd
+    import s3fs
+    import xarray as xr
+
+    if isinstance(dates, Iterable) and not isinstance(dates, str):
+        dates = pd.DatetimeIndex(dates)
+    else:
+        dates = pd.DatetimeIndex([dates])
+
+    # Access AWS using anonymous credentials
+    fs = s3fs.S3FileSystem(anon=True)
+
+    if averaging_time.lower() == "monthly":
+        file_list, _ = create_monthly_aod_list(dates, fs, warning=not error_missing)
+    elif averaging_time.lower() == "daily":
+        file_list, _ = create_daily_aod_list(dates, fs, warning=not error_missing)
+    else:
+        raise ValueError(
+            f"Invalid input for 'averaging_time' {averaging_time!r}: "
+            "Valid values are 'daily' or 'monthly'"
+        )
+
+    if len(file_list) == 0 or all(f is None for f in file_list):
+        raise ValueError(f"Files not available for product and dates: {dates}")
+
+    aws_files = [fs.open(f) for f in file_list if f is not None]
+
+    dset = xr.open_mfdataset(aws_files, concat_dim="time", combine="nested")
+
+    return dset
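For review convenience, a minimal usage sketch of the new AVHRR AOT reader added above (network access and `s3fs` required; the example dates are arbitrary points within the CDR record):

```python
import pandas as pd

from monetio.sat.nesdis_avhrr_aot_aws_gridded import open_dataset, open_mfdataset

# One daily file, fetched anonymously from the NOAA CDR bucket
ds = open_dataset("2023-01-01", averaging_time="daily")
print(ds.data_vars)

# Several days combined along "time"; missing days warn and are skipped by default
ds2 = open_mfdataset(pd.date_range("2023-01-01", "2023-01-03"), error_missing=False)
```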
diff --git a/monetio/sat/nesdis_edr_viirs.py b/monetio/sat/nesdis_edr_viirs.py
deleted file mode 100644
index 19512ea3..00000000
--- a/monetio/sat/nesdis_edr_viirs.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import os
-
-import xarray as xr
-
-server = "ftp.star.nesdis.noaa.gov"
-base_dir = "/pub/smcd/jhuang/npp.viirs.aerosol.data/edraot550/"
-
-
-def open_dataset(date, resolution="high", datapath="."):
-    current = change_dir(datapath)  # noqa: F841
-    # check resolution; by default 0.1 degree data is assumed
-    if resolution in {"high", "h"}:
-        # this is the 0.1 degree data
-        nlat = 1800
-        nlon = 3600
-        lon, lat = _get_latlons(nlat, nlon)
-        fname, date = download_data(date, resolution="high")
-    else:
-        nlat = 720
-        nlon = 1440
-        lon, lat = _get_latlons(nlat, nlon)
-        fname, date = download_data(date, resolution=0.25)
-    # unzip fname
-    fname = _unzip_file(fname)
-    # read the data
-    data = read_data(fname, lat, lon, date)
-    return data
-
-
-def open_mfdataset(dates, resolution="high", datapath="."):
-    das = []
-    for i in dates:
-        das.append(open_dataset(i, resolution=resolution, datapath=datapath))
-    ds = xr.concat(das, dim="time")
-    return ds
-
-
-def read_data(fname, lat, lon, date):
-    from numpy import float32, fromfile, nan
-    from pandas import to_datetime
-
-    f = fromfile(fname, dtype=float32)
-    nlat, nlon = lon.shape
-    aot = f.reshape(2, nlat, nlon)[0, :, :].reshape(1, nlat, nlon)
-    aot[aot < -999] = nan
-    datearr = to_datetime([date])
-    da = xr.DataArray(aot, coords=[datearr, range(nlat), range(nlon)], dims=["time", "y", "x"])
-    da["latitude"] = (("y", "x"), lat)
-    da["longitude"] = (("y", "x"), lon)
-    da.attrs["units"] = ""
-    da.name = "VIIRS EDR AOD"
-    da.attrs["long_name"] = "Aerosol Optical Depth"
-    da.attrs["source"] = (
-        "ftp://ftp.star.nesdis.noaa.gov/pub/smcd/jhuang/npp.viirs.aerosol.data/edraot550"
-    )
-    return da
-
-
-def _unzip_file(fname):
-    import subprocess
-
-    subprocess.run(["gunzip", "-f", fname])
-    return fname[:-3]
-
-
-def change_dir(to_path):
-    current = os.getcwd()
-    os.chdir(to_path)
-    return current
-
-
-def download_data(date, resolution="high"):
-    import ftplib
-    from datetime import datetime
-
-    if isinstance(date, datetime):
-        year = date.strftime("%Y")
-        yyyymmdd = date.strftime("%Y%m%d")
-    else:
-        from pandas import Timestamp
-
-        date = Timestamp(date)
-        year = date.strftime("%Y")
-        yyyymmdd = date.strftime("%Y%m%d")
-    if resolution == "high":
-        file = f"npp_aot550_edr_gridded_0.10_{yyyymmdd}.high.bin.gz"
-    else:
-        file = f"npp_aot550_edr_gridded_0.25_{yyyymmdd}.high.bin.gz"
-
-    ftp = ftplib.FTP(server)
-    ftp.login()
-    # print(base_dir)
-    # print(year)
-    # print(base_dir + year)
-    ftp.cwd(base_dir + year)
-    # print(file)
-    ftp.retrbinary("RETR " + file, open(file, "wb").write)
-    return file, date
-
-
-def _get_latlons(nlat, nlon):
-    from numpy import linspace, meshgrid
-
-    lon_min = -179.875
-    lon_max = -1 * lon_min
-    lat_min = -89.875
-    lat_max = -1.0 * lat_min
-    lons = linspace(lon_min, lon_max, nlon)
-    lats = linspace(lat_min, lat_max, nlat)
-    lon, lat = meshgrid(lons, lats)
-    return lon, lat
diff --git a/monetio/sat/nesdis_eps_viirs.py b/monetio/sat/nesdis_eps_viirs.py
deleted file mode 100644
index 71dcd42c..00000000
--- a/monetio/sat/nesdis_eps_viirs.py
+++ /dev/null
@@ -1,192 +0,0 @@
-import os
-
-import xarray as xr
-
-server = "ftp.star.nesdis.noaa.gov"
-base_dir = "/pub/smcd/VIIRS_Aerosol/npp.viirs.aerosol.data/epsaot550/"
-
-
-def open_dataset(date, datapath="."):
-    """Short summary.
-
-    Parameters
-    ----------
-    date : type
-        Description of parameter `date`.
-    datapath : type
-        Description of parameter `datapath`.
-
-    Returns
-    -------
-    type
-        Description of returned object.
-
-    """
-    current = change_dir(datapath)
-    nlat = 720
-    nlon = 1440
-    lon, lat = _get_latlons(nlat, nlon)
-    if isinstance(date, str):
-        fname, date = download_data(date)
-    else:
-        fname, date = download_data(date)
-    print(fname)
-    data = read_data(fname, lat, lon, date)
-    change_dir(current)
-    return data.where(data > 0)
-
-
-def open_mfdataset(dates, datapath="."):
-    """Short summary.
-
-    Parameters
-    ----------
-    dates : type
-        Description of parameter `dates`.
-    datapath : type
-        Description of parameter `datapath`.
-
-    Returns
-    -------
-    type
-        Description of returned object.
-
-    """
-    from xarray import concat
-
-    das = []
-    for i in dates:
-        print(i)
-        das.append(open_dataset(i, datapath=datapath))
-    ds = concat(das, dim="time")
-    return ds
-
-
-def read_data(fname, lat, lon, date):
-    """Short summary.
-
-    Parameters
-    ----------
-    fname : type
-        Description of parameter `fname`.
-    lat : type
-        Description of parameter `lat`.
-    lon : type
-        Description of parameter `lon`.
-    date : type
-        Description of parameter `date`.
-
-    Returns
-    -------
-    type
-        Description of returned object.
-
-    """
-    from pandas import to_datetime
-
-    f = xr.open_dataset(fname)
-    datearr = to_datetime([date])
-    da = f["aot_ip_out"]
-    da = da.rename({"nlat": "y", "nlon": "x"})
-    da["latitude"] = (("y", "x"), lat)
-    da["longitude"] = (("y", "x"), lon)
-    da = da.expand_dims("time")
-    da["time"] = datearr
-    da.attrs["units"] = ""
-    da.name = "VIIRS EPS AOT"
-    da.attrs["long_name"] = "Aerosol Optical Thickness"
-    da.attrs["source"] = (
-        "ftp://ftp.star.nesdis.noaa.gov/pub/smcd/VIIRS_Aerosol/npp.viirs.aerosol.data/epsaot550"
-    )
-    return da
-
-
-def change_dir(to_path):
-    """Short summary.
-
-    Parameters
-    ----------
-    to_path : type
-        Description of parameter `to_path`.
-
-    Returns
-    -------
-    type
-        Description of returned object.
-
-    """
-    current = os.getcwd()
-    os.chdir(to_path)
-    return current
-
-
-def download_data(date, resolution="high"):
-    """Short summary.
-
-    Parameters
-    ----------
-    date : type
-        Description of parameter `date`.
-    resolution : type
-        Description of parameter `resolution`.
-
-    Returns
-    -------
-    type
-        Description of returned object.
-
-    """
-    import ftplib
-    from datetime import datetime
-
-    from pandas import DatetimeIndex
-
-    if isinstance(date, datetime) or isinstance(date, DatetimeIndex):
-        year = date.strftime("%Y")
-        yyyymmdd = date.strftime("%Y%m%d")
-    else:
-        from pandas import Timestamp
-
-        date = Timestamp(date)
-        year = date.strftime("%Y")
-        yyyymmdd = date.strftime("%Y%m%d")
-    # npp_eaot_ip_gridded_0.25_20181222.high.nc
-    # print(year, yyyymmdd)
-    file = f"npp_eaot_ip_gridded_0.25_{yyyymmdd}.high.nc"
-    exists = os.path.isfile(file)
-    if ~exists:
-        ftp = ftplib.FTP(server)
-        ftp.login()
-        ftp.cwd(base_dir + year)
-        ftp.retrbinary("RETR " + file, open(file, "wb").write)
-    else:
-        print(f"File Already Exists! Reading: {file}")
-    return file, date
-
-
-def _get_latlons(nlat, nlon):
-    """Short summary.
-
-    Parameters
-    ----------
-    nlat : type
-        Description of parameter `nlat`.
-    nlon : type
-        Description of parameter `nlon`.
-
-    Returns
-    -------
-    type
-        Description of returned object.
-
-    """
-    from numpy import linspace, meshgrid
-
-    lon_min = -179.875
-    lon_max = -1 * lon_min
-    lat_min = -89.875
-    lat_max = -1.0 * lat_min
-    lons = linspace(lon_min, lon_max, nlon)
-    lats = linspace(lat_max, lat_min, nlat)
-    lon, lat = meshgrid(lons, lats)
-    return lon, lat
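The two deletions above remove the legacy readers that pulled gridded files from ftp.star.nesdis.noaa.gov. A rough, illustrative mapping to their closest replacements in this PR (argument names and output grids differ, so this is not a drop-in substitution):

```python
# Old (removed)                        ->  Closest new reader (this PR)
# nesdis_edr_viirs.open_dataset(date)  ->  nesdis_viirs_aod_aws_gridded.open_dataset(date, satellite="SNPP")
# nesdis_eps_viirs.open_dataset(date)  ->  nesdis_eps_viirs_aod_nrt.open_dataset(date)  # near-real-time files
```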
diff --git a/monetio/sat/nesdis_eps_viirs_aod_nrt.py b/monetio/sat/nesdis_eps_viirs_aod_nrt.py
new file mode 100644
index 00000000..fe094e2f
--- /dev/null
+++ b/monetio/sat/nesdis_eps_viirs_aod_nrt.py
@@ -0,0 +1,173 @@
+import pandas as pd
+
+
+def build_urls(dates, *, daily=True, data_resolution=0.1, satellite="NOAA20"):
+    """Construct URLs for downloading NESDIS EPS VIIRS AOD NRT data.
+
+    Parameters
+    ----------
+    dates : pd.DatetimeIndex or iterable of datetime
+        Dates to download data for.
+    daily : bool, optional
+        Whether to download daily (default) or monthly data.
+    data_resolution : float, optional
+        Resolution of data in degrees (0.1 or 0.25).
+    satellite : str, optional
+        Satellite platform, 'SNPP' or 'NOAA20'.
+
+    Returns
+    -------
+    urls, fnames : pd.Series
+        Two Series: the URLs and the corresponding file names.
+
+    Notes
+    -----
+    Monthly data are only available at 0.25 degree resolution.
+    """
+    import warnings
+    from collections.abc import Iterable
+
+    if isinstance(dates, Iterable) and not isinstance(dates, str):
+        dates = pd.DatetimeIndex(dates)
+    else:
+        dates = pd.DatetimeIndex([dates])
+
+    if daily:
+        dates = dates.floor("D").unique()
+    else:  # monthly
+        dates = dates.to_period("M").to_timestamp().unique()
+
+    if data_resolution != 0.25 and not daily:
+        warnings.warn(
+            "Monthly data is only available at 0.25 deg resolution, "
+            f"got 'data_resolution' {data_resolution!r}"
+        )
+
+    sat_dirname = satellite.lower()
+    if satellite.upper() == "SNPP":
+        sat = "npp" if daily else "snpp"
+    elif satellite.upper() == "NOAA20":
+        sat = "noaa20"
+    res = str(data_resolution).ljust(5, "0")
+    aod_dirname = "aod/eps" if daily else "aod_monthly"
+
+    urls = []
+    fnames = []
+
+    print("Building VIIRS URLs...")
+    base_url = (
+        "https://www.star.nesdis.noaa.gov/pub/smcd/VIIRS_Aerosol/viirs_aerosol_gridded_data/"
+        f"{sat_dirname}/{aod_dirname}/"
+    )
+
+    for date in dates:
+        if daily:
+            fname = "{}/viirs_eps_{}_aod_{}_deg_{}_nrt.nc".format(
+                date.strftime("%Y"),
+                sat,
+                res,
+                date.strftime("%Y%m%d"),
+            )
+        else:
+            fname = "viirs_aod_monthly_{}_{}_deg_{}_nrt.nc".format(
+                sat,
+                res,
+                date.strftime("%Y%m"),
+            )
+        url = base_url + fname
+        urls.append(url)
+        fnames.append(fname)
+
+    # Note: files needed for comparison
+    urls = pd.Series(urls, index=None)
+    fnames = pd.Series(fnames, index=None)
+
+    return urls, fnames
+
+
+def open_dataset(date, *, satellite="NOAA20", data_resolution=0.1, daily=True):
+    """Load VIIRS AOD NRT data from the NESDIS STAR server for a single date.
+
+    Parameters
+    ----------
+    date : str or datetime-like
+        The date for which to open the dataset.
+    satellite : str, optional
+        Satellite platform, 'SNPP' or 'NOAA20' (default).
+    data_resolution : float, optional
+        Resolution of data in degrees, 0.1 (default) or 0.25.
+    daily : bool, optional
+        Whether to load daily (default) or monthly data.
+
+    Returns
+    -------
+    xarray.Dataset
+        The opened dataset, with a singleton ``time`` coordinate.
+    """
+    from io import BytesIO
+
+    import pandas as pd
+    import requests
+    import xarray as xr
+
+    if not isinstance(date, pd.Timestamp):
+        d = pd.to_datetime(date)
+    else:
+        d = date
+
+    if satellite.lower() not in ("noaa20", "snpp"):
+        raise ValueError(
+            f"Invalid input for 'satellite' {satellite!r}: Valid values are 'NOAA20' or 'SNPP'"
+        )
+
+    if data_resolution not in {0.1, 0.25}:
+        raise ValueError(
+            f"Invalid input for 'data_resolution' {data_resolution!r}: "
+            "Valid values are 0.1 or 0.25"
+        )
+
+    urls, _ = build_urls(d, satellite=satellite, data_resolution=data_resolution, daily=daily)
+
+    r = requests.get(urls[0], stream=True)
+    r.raise_for_status()
+    dset = xr.open_dataset(BytesIO(r.content))
+
+    dset = dset.expand_dims(time=[d]).set_coords(["time"])
+
+    return dset
+
+
+def open_mfdataset(dates, satellite="NOAA20", data_resolution=0.1, daily=True, error_missing=False):
+    """Load and combine VIIRS AOD NRT data for multiple dates.
+
+    Parameters
+    ----------
+    dates : iterable of datetime-like
+        The dates for which to retrieve the data.
+    satellite : str, optional
+        Satellite platform, 'SNPP' or 'NOAA20' (default).
+    data_resolution : float, optional
+        Resolution of data in degrees, 0.1 (default) or 0.25.
+    daily : bool, optional
+        Whether to load daily (default) or monthly data.
+    error_missing : bool, optional
+        If False (default), skip missing files with warning and continue processing.
+        Otherwise, raise an error.
+
+    Returns
+    -------
+    xarray.Dataset
+        The combined dataset, concatenated along ``time``.
+    """
+    import warnings
+    from collections.abc import Iterable
+    from io import BytesIO
+
+    import pandas as pd
+    import requests
+    import xarray as xr
+
+    if isinstance(dates, Iterable) and not isinstance(dates, str):
+        dates = pd.DatetimeIndex(dates)
+    else:
+        dates = pd.DatetimeIndex([dates])
+
+    if satellite.lower() not in ("noaa20", "snpp"):
+        raise ValueError(
+            f"Invalid input for 'satellite' {satellite!r}: Valid values are 'NOAA20' or 'SNPP'"
+        )
+
+    if data_resolution not in {0.1, 0.25}:
+        raise ValueError(
+            f"Invalid input for 'data_resolution' {data_resolution!r}: "
+            "Valid values are 0.1 or 0.25"
+        )
+
+    urls, _ = build_urls(dates, satellite=satellite, data_resolution=data_resolution, daily=daily)
+
+    dsets = []
+    for url, date in zip(urls, dates):
+        r = requests.get(url, stream=True)
+        if r.status_code != 200:
+            msg = f"Failed to access file on NESDIS server: {url}"
+            if error_missing:
+                raise RuntimeError(msg)
+            else:
+                warnings.warn(msg)
+        else:
+            ds = xr.open_dataset(BytesIO(r.content)).expand_dims(time=[date]).set_coords(["time"])
+            dsets.append(ds)
+
+    if len(dsets) == 0:
+        raise ValueError(f"Files not available for product and dates: {dates}")
+
+    dset = xr.concat(dsets, dim="time")
+
+    return dset
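A sketch of how the new NRT module can be exercised (the STAR server only keeps recent files, so the example date below is hypothetical and may have aged off):

```python
import pandas as pd

from monetio.sat.nesdis_eps_viirs_aod_nrt import build_urls, open_dataset

# Inspect the URLs that would be fetched; no download happens here
urls, fnames = build_urls(
    pd.date_range("2023-09-01", periods=2), satellite="NOAA20", data_resolution=0.1
)
print(urls[0])

# Fetch one daily NRT file over HTTPS into memory
ds = open_dataset("2023-09-01", satellite="NOAA20", data_resolution=0.1)
print(ds["AOD550"].dims)
```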
diff --git a/monetio/sat/nesdis_viirs_aod_aws_gridded.py b/monetio/sat/nesdis_viirs_aod_aws_gridded.py
new file mode 100644
index 00000000..23e74a7a
--- /dev/null
+++ b/monetio/sat/nesdis_viirs_aod_aws_gridded.py
@@ -0,0 +1,315 @@
+def create_daily_aod_list(data_resolution, satellite, date_generated, fs, warning=False):
+    """
+    Creates a list of daily AOD (Aerosol Optical Depth) files and calculates the total size of the files.
+
+    Parameters:
+        data_resolution (str): The resolution of the AOD data.
+        satellite (str): The satellite name. Can be 'SNPP' or 'NOAA20'.
+        date_generated (list): A list of dates for which to check the existence of AOD files.
+        fs (FileSystem): The file system object used to check file existence and size.
+
+    Returns:
+        tuple: A tuple containing the list of file paths and the total size of the files.
+    """
+    import warnings
+
+    # Loop through observation dates & check for files
+    nodd_file_list = []
+    nodd_total_size = 0
+    for date in date_generated:
+        file_date = date.strftime("%Y%m%d")
+        year = file_date[:4]
+
+        if satellite == "SNPP":
+            sat_name = "npp"
+        elif satellite == "NOAA20":
+            sat_name = "noaa20"
+        file_name = (
+            "viirs_eps_" + sat_name + "_aod_" + data_resolution + "_deg_" + file_date + ".nc"
+        )
+        prod_path = (
+            "noaa-jpss/"
+            + satellite
+            + "/VIIRS/"
+            + satellite
+            + "_VIIRS_Aerosol_Optical_Depth_Gridded_Reprocessed/"
+            + data_resolution[:4]
+            + "_Degrees_Daily/"
+            + year
+            + "/"
+        )
+        # If the file exists, add its path to the list and its size to the total
+        if fs.exists(prod_path + file_name):
+            nodd_file_list.extend(fs.ls(prod_path + file_name))
+            nodd_total_size = nodd_total_size + fs.size(prod_path + file_name)
+        else:
+            msg = "File does not exist on AWS: " + prod_path + file_name
+            if warning:
+                warnings.warn(msg, stacklevel=2)
+                nodd_file_list.append(None)
+            else:
+                raise ValueError(msg)
+
+    return nodd_file_list, nodd_total_size
+
+
+# Create list of available monthly data file paths & total size of files
+def create_monthly_aod_list(satellite, date_generated, fs, warning=False):
+    """
+    Creates a list of monthly AOD (Aerosol Optical Depth) files for a given satellite and date range.
+
+    Args:
+        satellite (str): The satellite name. Can be 'SNPP' or 'NOAA20'.
+        date_generated (list): A list of datetime objects representing the observation dates.
+        fs: The file system object used to check for file existence and retrieve file information.
+
+    Returns:
+        tuple: A tuple containing the list of file paths and the total size of the files.
+    """
+    import warnings
+
+    # Loop through observation dates & check for files
+    nodd_file_list = []
+    nodd_total_size = 0
+    year_month_list = []
+    for date in date_generated:
+        file_date = date.strftime("%Y%m%d")
+        year_month = file_date[:6]
+        if year_month not in year_month_list:
+            year_month_list.append(year_month)
+
+            if satellite == "SNPP":
+                sat_name = "snpp"
+            elif satellite == "NOAA20":
+                sat_name = "noaa20"
+            file_name = "viirs_aod_monthly_" + sat_name + "_0.250_deg_" + year_month + ".nc"
+            prod_path = (
+                "noaa-jpss/"
+                + satellite
+                + "/VIIRS/"
+                + satellite
+                + "_VIIRS_Aerosol_Optical_Depth_Gridded_Reprocessed/0.25_Degrees_Monthly/"
+            )
+            # If the file exists, add its path to the list and its size to the total
+            if fs.exists(prod_path + file_name):
+                nodd_file_list.extend(fs.ls(prod_path + file_name))
+                nodd_total_size = nodd_total_size + fs.size(prod_path + file_name)
+            else:
+                msg = "File does not exist on AWS: " + prod_path + file_name
+                if warning:
+                    warnings.warn(msg, stacklevel=2)
+                    nodd_file_list.append(None)
+                else:
+                    raise ValueError(msg)
+
+    return nodd_file_list, nodd_total_size
+
+
+# Create list of available weekly data file paths & total size of files
+def create_weekly_aod_list(satellite, date_generated, fs, warning=False):
+    """
+    Creates a list of files and calculates the total size of files for a given satellite, observation dates, and file system.
+
+    Parameters:
+        satellite (str): The satellite name. Can be 'SNPP' or 'NOAA20'.
+        date_generated (list): A list of observation dates.
+        fs (FileSystem): The file system object.
+
+    Returns:
+        tuple: A tuple containing the list of files and the total size of files.
+    """
+    # Note: `warning` is accepted for signature consistency with the daily and
+    # monthly helpers but is unused here; the weekly search simply collects the
+    # files whose date ranges overlap the requested dates.
+    # Loop through observation dates & check for files
+    nodd_file_list = []
+    nodd_total_size = 0
+    for date in date_generated:
+        file_date = date.strftime("%Y%m%d")
+        year = file_date[:4]
+
+        prod_path = (
+            "noaa-jpss/"
+            + satellite
+            + "/VIIRS/"
+            + satellite
+            + "_VIIRS_Aerosol_Optical_Depth_Gridded_Reprocessed/0.25_Degrees_Weekly/"
+            + year
+            + "/"
+        )
+        # Get list of all files in given year on NODD
+        all_files = fs.ls(prod_path)
+        # Loop through files, check if file date falls within observation date range
+        for file in all_files:
+            # File names encode their week as "<start>-<end>" in the 8th "_" field
+            period = file.split("/")[-1].split("_")[7].split(".")[0]
+            file_start, file_end = period.split("-")
+            # If file within observation range, add path to list and add file size to total
+            if file_date >= file_start and file_date <= file_end:
+                if file not in nodd_file_list:
+                    nodd_file_list.append(file)
+                    nodd_total_size = nodd_total_size + fs.size(file)
+
+    return nodd_file_list, nodd_total_size
+
+
+def open_dataset(date, satellite="SNPP", data_resolution=0.1, averaging_time="daily"):
+    """Load VIIRS AOD data from AWS for the given date, satellite, data resolution, and averaging time.
+
+    Parameters:
+        date (str or datetime-like): The date for which to open the dataset.
+            SNPP has data from 2012-01-19 to 2020-12-31.
+            NOAA20 has data from 2018-01-01 to 2020-12-31.
+        satellite (str): The satellite to retrieve data from.
+            Valid values are 'SNPP' or 'NOAA20'.
+        data_resolution (float or str, optional): The data resolution.
+            Valid values are '0.050', '0.100', or '0.250'. Defaults to 0.1°.
+            Only has an effect when `averaging_time` is 'daily'.
+            For 'weekly' and 'monthly' data, the resolution is always 0.25.
+        averaging_time (str, optional): The averaging time.
+            Valid values are 'daily', 'weekly', or 'monthly'. Defaults to 'daily'.
+
+    Returns:
+        xarray.Dataset: The opened dataset.
+
+    Raises:
+        ValueError: If the input parameters are invalid.
+    """
+    import pandas as pd
+    import s3fs
+    import xarray as xr
+
+    if satellite not in {"SNPP", "NOAA20"}:
+        raise ValueError(
+            f"Invalid input for 'satellite' {satellite!r}: Valid values are 'SNPP' or 'NOAA20'"
+        )
+
+    data_resolution_in = data_resolution
+    data_resolution = str(data_resolution).ljust(5, "0")
+    if data_resolution not in {"0.050", "0.100", "0.250"}:
+        raise ValueError(
+            f"Invalid input for 'data_resolution' {data_resolution_in!r}: "
+            "Valid values are '0.050', '0.100', or '0.250'"
+        )
+
+    if isinstance(date, str):
+        date_generated = [pd.Timestamp(date)]
+    else:
+        date_generated = [date]
+
+    # Access AWS using anonymous credentials
+    fs = s3fs.S3FileSystem(anon=True)
+
+    if averaging_time.lower() == "monthly":
+        file_list, _ = create_monthly_aod_list(satellite, date_generated, fs, warning=False)
+    elif averaging_time.lower() == "weekly":
+        file_list, _ = create_weekly_aod_list(satellite, date_generated, fs, warning=False)
+    elif averaging_time.lower() == "daily":
+        file_list, _ = create_daily_aod_list(
+            data_resolution, satellite, date_generated, fs, warning=False
+        )
+    else:
+        raise ValueError(
+            f"Invalid input for 'averaging_time' {averaging_time!r}: "
+            "Valid values are 'daily', 'weekly', or 'monthly'"
+        )
+
+    if len(file_list) == 0 or all(f is None for f in file_list):
+        raise ValueError(f"Files not available for product and date: {date_generated[0]}")
+
+    aws_file = fs.open(file_list[0])
+
+    dset = xr.open_dataset(aws_file)
+
+    # Add datetime
+    dset = dset.expand_dims(time=date_generated).set_coords(["time"])
+
+    return dset
+
+
+def open_mfdataset(
+    dates, satellite="SNPP", data_resolution=0.1, averaging_time="daily", error_missing=False
+):
+    """
+    Opens and combines multiple NetCDF files into a single xarray dataset.
+
+    Parameters:
+        dates (pandas.DatetimeIndex): The dates for which to retrieve the data.
+            SNPP has data from 2012-01-19 to 2020-12-31.
+            NOAA20 has data from 2018-01-01 to 2020-12-31.
+        satellite (str): The satellite name.
+            Valid values are 'SNPP' or 'NOAA20'.
+        data_resolution (float or str, optional): The data resolution.
+            Valid values are '0.050', '0.100', or '0.250'. Defaults to 0.1°.
+            Only has an effect when `averaging_time` is 'daily'.
+            For 'weekly' and 'monthly' data, the resolution is always 0.25.
+        averaging_time (str, optional): The averaging time.
+            Valid values are 'daily', 'weekly', or 'monthly'. Defaults to 'daily'.
+        error_missing (bool, optional): If False (default), skip missing files with warning
+            and continue processing. Otherwise, raise an error.
+
+    Returns:
+        xarray.Dataset: The combined dataset containing the data for the specified dates.
+
+    Raises:
+        ValueError: If the input parameters are invalid.
+    """
+    from collections.abc import Iterable
+
+    import pandas as pd
+    import s3fs
+    import xarray as xr
+
+    if satellite not in {"SNPP", "NOAA20"}:
+        raise ValueError(
+            f"Invalid input for 'satellite' {satellite!r}: Valid values are 'SNPP' or 'NOAA20'"
+        )
+
+    data_resolution_in = data_resolution
+    data_resolution = str(data_resolution).ljust(5, "0")
+    if data_resolution not in {"0.050", "0.100", "0.250"}:
+        raise ValueError(
+            f"Invalid input for 'data_resolution' {data_resolution_in!r}: "
+            "Valid values are '0.050', '0.100', or '0.250'"
+        )
+
+    if isinstance(dates, Iterable) and not isinstance(dates, str):
+        dates = pd.DatetimeIndex(dates)
+    else:
+        dates = pd.DatetimeIndex([dates])
+
+    # Access AWS using anonymous credentials
+    fs = s3fs.S3FileSystem(anon=True)
+
+    if averaging_time.lower() == "monthly":
+        file_list, _ = create_monthly_aod_list(satellite, dates, fs, warning=not error_missing)
+    elif averaging_time.lower() == "weekly":
+        file_list, _ = create_weekly_aod_list(satellite, dates, fs, warning=not error_missing)
+    elif averaging_time.lower() == "daily":
+        file_list, _ = create_daily_aod_list(
+            data_resolution, satellite, dates, fs, warning=not error_missing
+        )
+    else:
+        raise ValueError(
+            f"Invalid input for 'averaging_time' {averaging_time!r}: "
+            "Valid values are 'daily', 'weekly', or 'monthly'"
+        )
+
+    if len(file_list) == 0 or all(f is None for f in file_list):
+        raise ValueError(f"Files not available for product and dates: {dates}")
+
+    if len(file_list) != len(dates):
+        raise ValueError(
+            "'dates' and discovered file list are not the same length. "
+            "Consider the time frequency ('averaging_time') when constructing your dates input."
+        )
+
+    dates_good = []
+    aws_files = []
+    for d, f in zip(dates, file_list):
+        if f is not None:
+            aws_files.append(fs.open(f))
+            dates_good.append(d)
+
+    dset = xr.open_mfdataset(aws_files, concat_dim="time", combine="nested")
+
+    dset["time"] = dates_good
+
+    return dset
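Usage sketch for the AWS-gridded VIIRS AOD reader, mirroring the tests added later in this PR (note that monthly requests should use month-start dates so that the input length matches the files discovered):

```python
import pandas as pd

from monetio.sat.nesdis_viirs_aod_aws_gridded import open_dataset, open_mfdataset

# One daily 0.25-degree SNPP file
ds = open_dataset("2020-01-01", satellite="SNPP", data_resolution=0.25)

# Two monthly means; "MS" (month start) keeps len(dates) == len(files found)
months = pd.date_range(start="2020-01-01", freq="MS", periods=2)
ds_monthly = open_mfdataset(months, satellite="SNPP", averaging_time="monthly")
```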
diff --git a/monetio/sat/nesdis_viirs_ndvi_aws_gridded.py b/monetio/sat/nesdis_viirs_ndvi_aws_gridded.py
new file mode 100644
index 00000000..746a363e
--- /dev/null
+++ b/monetio/sat/nesdis_viirs_ndvi_aws_gridded.py
@@ -0,0 +1,140 @@
+def create_daily_vhi_list(date_generated, fs, warning=False):
+    """
+    Creates a list of daily NDVI (Normalized Difference Vegetation Index) files
+    and calculates the total size of the files.
+
+    Parameters:
+        date_generated (list): A list of dates for which to check the existence of NDVI files.
+        fs (FileSystem): The file system object used to check file existence and size.
+
+    Returns:
+        tuple: A tuple containing the list of file paths and the total size of the files.
+    """
+    import warnings
+
+    # Loop through observation dates & check for files
+    nodd_file_list = []
+    nodd_total_size = 0
+    for date in date_generated:
+        file_date = date.strftime("%Y%m%d")
+        year = file_date[:4]
+        prod_path = "noaa-cdr-ndvi-pds/data/" + year + "/"
+        patt = "VIIRS-Land_*_"
+        file_names = fs.glob(prod_path + patt + file_date + "_*.nc")
+        # If files exist, add their paths to the list and their sizes to the total
+        if file_names:
+            nodd_file_list.extend(file_names)
+            nodd_total_size = nodd_total_size + sum(fs.size(f) for f in file_names)
+        else:
+            msg = "File does not exist on AWS: " + prod_path + patt + file_date + "_*.nc"
+            if warning:
+                warnings.warn(msg)
+                nodd_file_list.append(None)
+            else:
+                raise ValueError(msg)
+
+    return nodd_file_list, nodd_total_size
+
+
+def open_dataset(date):
+    """
+    Opens a dataset for the given date.
+
+    Parameters:
+        date (str or datetime-like): The date for which to open the dataset.
+            Data from 1981 to present are available.
+
+    Returns:
+        xarray.Dataset: The opened dataset.
+
+    Raises:
+        ValueError: If the input parameters are invalid.
+    """
+    import pandas as pd
+    import s3fs
+    import xarray as xr
+
+    if isinstance(date, str):
+        date_generated = [pd.Timestamp(date)]
+    else:
+        date_generated = [date]
+
+    # Access AWS using anonymous credentials
+    fs = s3fs.S3FileSystem(anon=True)
+
+    file_list, _ = create_daily_vhi_list(date_generated, fs)
+
+    if len(file_list) == 0 or all(f is None for f in file_list):
+        raise ValueError(f"Files not available for product and date: {date_generated[0]}")
+
+    aws_file = fs.open(file_list[0])
+
+    dset = xr.open_dataset(aws_file, decode_cf=False)
+
+    # Deal with TIMEOFDAY variable manually to avoid warnings
+    m = dset["TIMEOFDAY"].attrs.pop("scale_factor")  # 0.01
+    b = dset["TIMEOFDAY"].attrs.pop("add_offset")  # 0
+    fv = dset["TIMEOFDAY"].attrs.pop("_FillValue")  # -9999
+    dset["TIMEOFDAY"] = dset["TIMEOFDAY"] * m + b
+    dset["TIMEOFDAY"].attrs.update(units="hours")  # -> auto timedelta conversion
+    dset = xr.decode_cf(dset)
+    dset["TIMEOFDAY"] = dset["TIMEOFDAY"].where(
+        dset["TIMEOFDAY"] != pd.Timedelta(fv * m + b, unit="hours")
+    )
+
+    return dset
+
+
+def open_mfdataset(dates, error_missing=False):
+    """
+    Opens and combines multiple NetCDF files into a single xarray dataset.
+
+    Parameters:
+        dates (pandas.DatetimeIndex): The dates for which to retrieve the data.
+        error_missing (bool, optional): If False (default), skip missing files with warning
+            and continue processing. Otherwise, raise an error.
+
+    Returns:
+        xarray.Dataset: The combined dataset containing the data for the specified dates.
+
+    Raises:
+        ValueError: If the input parameters are invalid.
+    """
+    from collections.abc import Iterable
+
+    import pandas as pd
+    import s3fs
+    import xarray as xr
+
+    if isinstance(dates, Iterable) and not isinstance(dates, str):
+        dates = pd.DatetimeIndex(dates)
+    else:
+        dates = pd.DatetimeIndex([dates])
+
+    # Access AWS using anonymous credentials
+    fs = s3fs.S3FileSystem(anon=True)
+
+    file_list, _ = create_daily_vhi_list(dates, fs, warning=not error_missing)
+
+    if len(file_list) == 0 or all(f is None for f in file_list):
+        raise ValueError(f"Files not available for product and dates: {dates}")
+
+    aws_files = [fs.open(f) for f in file_list if f is not None]
+
+    dset = xr.open_mfdataset(
+        aws_files,
+        concat_dim="time",
+        combine="nested",
+        decode_cf=False,
+    )
+
+    # Deal with TIMEOFDAY variable manually to avoid warnings
+    m = dset["TIMEOFDAY"].attrs.pop("scale_factor")  # 0.01
+    b = dset["TIMEOFDAY"].attrs.pop("add_offset")  # 0
+    fv = dset["TIMEOFDAY"].attrs.pop("_FillValue")  # -9999
+    dset["TIMEOFDAY"] = dset["TIMEOFDAY"] * m + b
+    dset["TIMEOFDAY"].attrs.update(units="hours")  # -> auto timedelta conversion
+    dset = xr.decode_cf(dset)
+    dset["TIMEOFDAY"] = dset["TIMEOFDAY"].where(
+        dset["TIMEOFDAY"] != pd.Timedelta(fv * m + b, unit="hours")
+    )
+
+    return dset
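The NDVI reader opens files with `decode_cf=False` because `TIMEOFDAY` carries `scale_factor`/`add_offset` alongside time-like units, which xarray would otherwise warn about. The module scales the variable manually, lets `decode_cf` turn the "hours" units into timedeltas, then masks the scaled fill value. A quick usage sketch:

```python
from monetio.sat.nesdis_viirs_ndvi_aws_gridded import open_dataset

ds = open_dataset("2023-01-01")
print(ds["NDVI"].dims)        # ("time", "latitude", "longitude")
print(ds["TIMEOFDAY"].dtype)  # timedelta64[ns] after the manual CF handling
```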
f"{res:.3f}" + + ds = open_dataset(date, sat, s_res) + assert set(ds.dims) == {"time", "lat", "lon"} + assert ds.sizes["time"] == 1 + assert ds.sizes["lat"] == int(180 / res) + assert ds.sizes["lon"] == int(360 / res) + assert ds.attrs["satellite_name"] == ("NPP" if sat == "SNPP" else "NOAA 20") + assert ds.attrs["spatial_resolution"].strip().startswith(str(res)) + assert (ds.time == pd.DatetimeIndex([date])).all() + + +def test_open_dataset_bad_input(): + with pytest.raises(ValueError, match="Invalid input"): + open_dataset("2020-01-01", satellite="GOES-16") + + with pytest.raises(ValueError, match="Invalid input"): + open_dataset("2020-01-01", satellite="both") + + with pytest.raises(ValueError, match="Invalid input"): + open_dataset("2020-01-01", data_resolution=100) + + with pytest.raises(ValueError, match="Invalid input"): + open_dataset("2020-01-01", averaging_time="asdf") + + +def test_open_dataset_no_data(): + with pytest.raises(ValueError, match="File does not exist on AWS:"): + open_dataset("1900-01-01") + + +def test_open_mfdataset_bad_input(): + cases = [ + {"satellite": "GOES-16"}, + {"satellite": "both"}, + {"data_resolution": 100}, + {"averaging_time": "asdf"}, + ] + for case in cases: + with pytest.raises(ValueError, match="Invalid input"): + open_mfdataset(["2020-01-01"], **case) + + +def test_open_mfdataset_daily(): + ds = open_mfdataset(["2020-01-01", "2020-01-02"], satellite="SNPP", data_resolution=0.25) + assert set(ds.dims) == {"time", "lat", "lon"} + assert ds.sizes["time"] == 2 + assert ds.attrs["spatial_resolution"].strip().startswith("0.25") + assert (ds.time == pd.DatetimeIndex(["2020-01-01", "2020-01-02"])).all() + + +def test_open_mfdataset_monthly(): + with pytest.raises(ValueError, match="not the same length"): + open_mfdataset(["2020-01-01", "2020-01-02"], averaging_time="monthly") + + months = pd.date_range(start="2020-01-01", freq="MS", periods=2) + ds = open_mfdataset(months, averaging_time="monthly") + assert ds.sizes["time"] == 2 + + +def test_open_mfdataset_daily_warning(): + dates = ["2012-01-18", "2012-01-19"] # 2012-01-19 is the first available date + + # Warn and skip by default + with pytest.warns(match="File does not exist on AWS:"): + ds = open_mfdataset( + dates, + satellite="SNPP", + data_resolution=0.25, + averaging_time="daily", + ) + assert ds.sizes["time"] == 1 + assert (ds.time == pd.DatetimeIndex([dates[1]])).all() + + # Error optionally + with pytest.raises(ValueError, match="File does not exist on AWS:"): + ds = open_mfdataset( + dates, + satellite="SNPP", + data_resolution=0.25, + averaging_time="daily", + error_missing=True, + ) + + +def test_open_mfdataset_monthly_warning(): + dates = ["2011-12-01", "2012-01-01"] + + # Warn and skip by default + with pytest.warns(match="File does not exist on AWS:"): + ds = open_mfdataset( + dates, + satellite="SNPP", + data_resolution=0.25, + averaging_time="monthly", + ) + assert ds.sizes["time"] == 1 + assert (ds.time == pd.DatetimeIndex([dates[1]])).all() + + # Error optionally + with pytest.raises(ValueError, match="File does not exist on AWS:"): + ds = open_mfdataset( + dates, + satellite="SNPP", + data_resolution=0.25, + averaging_time="monthly", + error_missing=True, + ) + + +def test_open_mfdataset_no_data(): + with pytest.raises(ValueError, match="Files not available"), pytest.warns( + match="File does not exist on AWS:" + ): + open_mfdataset(["1900-01-01"]) diff --git a/tests/test_viirs_aod_nrt.py b/tests/test_viirs_aod_nrt.py new file mode 100644 index 00000000..d07e8608 --- /dev/null 
+++ b/tests/test_viirs_aod_nrt.py
@@ -0,0 +1,84 @@
+import warnings
+
+import pandas as pd
+import pytest
+
+from monetio.sat.nesdis_eps_viirs_aod_nrt import open_dataset, open_mfdataset
+
+NOW = pd.Timestamp.now("UTC")
+TODAY = NOW.floor("D")
+
+with warnings.catch_warnings():
+    warnings.filterwarnings(
+        "ignore",
+        message="Converting to Period representation will drop timezone information.",
+    )
+    THIS_MONTH = TODAY.to_period("M").to_timestamp()
+
+LAST_MONTH = THIS_MONTH - pd.DateOffset(months=1)
+LAST_LAST_MONTH = LAST_MONTH - pd.DateOffset(months=1)
+
+
+@pytest.mark.parametrize("res", [0.25, 0.1])
+@pytest.mark.parametrize("sat", ["NOAA20", "SNPP"])
+def test_open_dataset_daily(sat, res):
+    # Note: only NRT
+    date = (TODAY - pd.Timedelta(days=2)).tz_localize(None)
+    ds = open_dataset(date, satellite=sat, data_resolution=res)
+
+    assert date.strftime(r"%Y%m%d") in ds.attrs["dataset_name"]
+    assert ds.attrs["spatial_resolution"].strip() == f"{res:.2f} degree"
+    assert ds.attrs["satellite_name"] == ("Suomi NPP" if sat == "SNPP" else "NOAA 20")
+
+    assert set(ds.dims) == {"time", "lat", "lon"}
+    assert ds.sizes["time"] == 1
+    assert ds.sizes["lat"] == int(180 / res)
+    assert ds.sizes["lon"] == int(360 / res)
+    assert (ds.time == pd.DatetimeIndex([date])).all()
+    assert "AOD550" in ds.data_vars
+
+
+@pytest.mark.parametrize("sat", ["NOAA20", "SNPP"])
+def test_open_dataset_monthly(sat):
+    # Seems like only one is stored
+    if NOW - THIS_MONTH.tz_localize("UTC") > pd.Timedelta(hours=12):
+        date = LAST_MONTH
+    else:
+        date = LAST_LAST_MONTH
+
+    ds = open_dataset(date, satellite=sat, daily=False, data_resolution=0.25)
+    assert ds.sizes["time"] == 1
+
+
+def test_open_mfdataset():
+    today = TODAY.tz_localize(None)
+    dates = [today - pd.Timedelta(days=2), today - pd.Timedelta(days=3)]
+    ds = open_mfdataset(dates)
+    assert ds.sizes["time"] == len(dates)
+
+
+def test_missing_date():
+    from requests.exceptions import HTTPError
+
+    with pytest.raises(HTTPError):
+        open_dataset("1900-01-01")
+
+
+def test_missing_date_mf():
+    # No dsets collected
+    with pytest.raises(ValueError, match="Files not available for product and dates"), pytest.warns(
+        UserWarning, match="Failed to access file"
+    ):
+        open_mfdataset("1900-01-01")
+
+    # Error during dsets collection
+    with pytest.raises(RuntimeError, match="Failed to access file"):
+        open_mfdataset("1900-01-01", error_missing=True)
+
+    one_good = ["1900-01-01", TODAY.tz_localize(None) - pd.Timedelta(days=2)]
+    with pytest.warns(UserWarning, match="Failed to access file"):
+        ds = open_mfdataset(one_good)
+    assert ds.sizes["time"] == 1
+
+    with pytest.raises(RuntimeError, match="Failed to access file"):
+        open_mfdataset(one_good, error_missing=True)
diff --git a/tests/test_viirs_ndvi.py b/tests/test_viirs_ndvi.py
new file mode 100644
index 00000000..b599f53c
--- /dev/null
+++ b/tests/test_viirs_ndvi.py
@@ -0,0 +1,61 @@
+import sys
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from monetio.sat.nesdis_viirs_ndvi_aws_gridded import open_dataset, open_mfdataset
+
+if sys.version_info < (3, 7):
+    pytest.skip("s3fs requires Python 3.7+", allow_module_level=True)
+
+
+def test_open_dataset_no_data():
+    with pytest.raises(ValueError, match="File does not exist on AWS:"):
+        open_dataset("1900-01-01")
+
+
+def test_open_dataset():
+    date = "2023-01-01"
+    ds = open_dataset(date)
+
+    assert set(ds.dims) >= {"time", "latitude", "longitude"}
+    assert ds.sizes["time"] == 1
+    assert ds.sizes["latitude"] == 3600
+    assert ds.sizes["longitude"] == 7200
+    assert ds["time"] == pd.to_datetime(date)
+
+    assert "NDVI" in ds.data_vars
+    assert ds["NDVI"].dims == ("time", "latitude", "longitude")
+
+    assert np.abs(ds["NDVI"]).max().item() < 5
+    q = 0.02
+    a, b = ds["NDVI"].quantile((q, 1 - q)).values
+    assert -1 < a < b < 1
+
+    assert ds["TIMEOFDAY"].isnull().sum() > 0
+    assert ds["TIMEOFDAY"].to_series().min() >= pd.Timedelta("0h")
+    assert ds["TIMEOFDAY"].to_series().max() < pd.Timedelta("24h")
+
+
+def test_open_mfdataset():
+    dates = ["2023-01-01", "2023-01-02"]
+    ds = open_mfdataset(dates)
+    assert (ds["time"] == pd.DatetimeIndex(dates)).all()
+
+
+def test_open_mfdataset_error():
+    dates = ["1900-01-01", "2023-01-01"]
+
+    with pytest.warns(UserWarning, match="File does not exist on AWS:"):
+        ds = open_mfdataset(dates)
+    assert ds.sizes["time"] == 1
+    assert ds["time"] == pd.to_datetime(dates[-1])
+
+    with pytest.raises(ValueError, match="File does not exist on AWS:"):
+        _ = open_mfdataset(dates, error_missing=True)
+
+    with pytest.raises(ValueError, match="Files not available for product and dates"), pytest.warns(
+        UserWarning, match="File does not exist on AWS:"
+    ):
+        _ = open_mfdataset(dates[:1], error_missing=False)