From fa4b376b9877666a383702a17c16af83c8ab6ff2 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 11 Jul 2023 15:46:49 -0600 Subject: [PATCH 1/8] Add failing test --- tests/test_aeronet.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_aeronet.py b/tests/test_aeronet.py index bf380403..938f198f 100644 --- a/tests/test_aeronet.py +++ b/tests/test_aeronet.py @@ -220,3 +220,10 @@ def test_interp_daily_with_pytspack(): df = aeronet.add_data(dates, daily=True, n_procs=1, interp_to_aod_values=standard_wavelengths) assert {f"aod_{int(wl)}nm" for wl in standard_wavelengths}.issubset(df.columns) + + +def test_issue100(): + dates = pd.date_range(start="2019-09-01", end="2019-09-2", freq="H") + df1 = aeronet.add_data(dates, n_procs=1) + df2 = aeronet.add_data(dates, n_procs=2) + assert len(df1) == len(df2) From 307799bacd025475d0cd5f28613442dac2048345 Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 11 Jul 2023 15:53:27 -0600 Subject: [PATCH 2/8] Sort of fix But will fail if day or less requested --- monetio/obs/aeronet.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/monetio/obs/aeronet.py b/monetio/obs/aeronet.py index 853c93c0..b77d965e 100644 --- a/monetio/obs/aeronet.py +++ b/monetio/obs/aeronet.py @@ -151,11 +151,10 @@ def add_data( # Split up by day min_date = dates.min() max_date = dates.max() - days = pd.date_range(start=min_date, end=max_date, freq="D") # TODO: subtract 1? - days1 = days + pd.Timedelta(days=1) + days = pd.date_range(start=min_date, end=max_date, freq="D") dfs = Parallel(n_jobs=n_procs, verbose=verbose)( delayed(_parallel_aeronet_call)(pd.DatetimeIndex([d1, d2]), **kwargs, freq=None) - for d1, d2 in zip(days, days1) + for d1, d2 in zip(days[:-1], days[1:]) ) df = pd.concat(dfs, ignore_index=True).drop_duplicates() if freq is not None: From 44b6fca885041638f0c55ed9d63c59c312e7f1cd Mon Sep 17 00:00:00 2001 From: zmoon Date: Tue, 11 Jul 2023 16:36:52 -0600 Subject: [PATCH 3/8] WIP In the two day case, the parallel run has somewhat fewer rows Removing the `.drop_duplicates()` didn't change things Will have to investigate more --- monetio/obs/aeronet.py | 18 ++++++++++++------ tests/test_aeronet.py | 22 +++++++++++++++++++--- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/monetio/obs/aeronet.py b/monetio/obs/aeronet.py index b77d965e..a2024495 100644 --- a/monetio/obs/aeronet.py +++ b/monetio/obs/aeronet.py @@ -147,14 +147,20 @@ def add_data( ) requested_parallel = n_procs > 1 or n_procs == -1 - if has_joblib and requested_parallel: - # Split up by day + + # Split up by day + dates = pd.to_datetime(dates) + if dates is not None: min_date = dates.min() max_date = dates.max() - days = pd.date_range(start=min_date, end=max_date, freq="D") + time_bounds = pd.date_range(start=min_date, end=max_date, freq="D") + if max_date not in time_bounds: + time_bounds = time_bounds.append(pd.DatetimeIndex([max_date])) + + if has_joblib and requested_parallel and dates is not None and len(time_bounds) > 2: dfs = Parallel(n_jobs=n_procs, verbose=verbose)( - delayed(_parallel_aeronet_call)(pd.DatetimeIndex([d1, d2]), **kwargs, freq=None) - for d1, d2 in zip(days[:-1], days[1:]) + delayed(_parallel_aeronet_call)(pd.DatetimeIndex([t1, t2]), **kwargs, freq=None) + for t1, t2 in zip(time_bounds[:-1], time_bounds[1:]) ) df = pd.concat(dfs, ignore_index=True).drop_duplicates() if freq is not None: @@ -461,7 +467,7 @@ def add_data( now = datetime.utcnow() self.dates = pd.date_range(start=now.date(), end=now, freq="H") else: - self.dates = dates + self.dates = pd.DatetimeIndex(dates) if product is not None: self.prod = product.upper() else: diff --git a/tests/test_aeronet.py b/tests/test_aeronet.py index 938f198f..1d69a3ac 100644 --- a/tests/test_aeronet.py +++ b/tests/test_aeronet.py @@ -222,8 +222,24 @@ def test_interp_daily_with_pytspack(): assert {f"aod_{int(wl)}nm" for wl in standard_wavelengths}.issubset(df.columns) -def test_issue100(): - dates = pd.date_range(start="2019-09-01", end="2019-09-2", freq="H") +@pytest.mark.parametrize( + "dates", + [ + pd.to_datetime(["2019-09-01", "2019-09-02"]), + pd.to_datetime(["2019-09-01", "2019-09-03"]), + pd.to_datetime(["2019-09-01", "2019-09-01 12:00"]), + ], + ids=[ + "one day", + "two days", + "half day", + ], +) +def test_issue100(dates, request): + if request.node.callspec.id == "two days": + pytest.xfail(reason="??") + df1 = aeronet.add_data(dates, n_procs=1) df2 = aeronet.add_data(dates, n_procs=2) - assert len(df1) == len(df2) + assert df1.equals(df2) + assert dates[0] < df1.time.min() < df1.time.max() < dates[-1] From 90c5b5d3fa2c50db3abb7b5aaa322967c0b58c7a Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 20 Sep 2023 16:41:27 -0600 Subject: [PATCH 4/8] Multi-day test now passes just have to sort first before comparing --- tests/test_aeronet.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_aeronet.py b/tests/test_aeronet.py index 1d69a3ac..39c2dfe9 100644 --- a/tests/test_aeronet.py +++ b/tests/test_aeronet.py @@ -236,10 +236,14 @@ def test_interp_daily_with_pytspack(): ], ) def test_issue100(dates, request): - if request.node.callspec.id == "two days": - pytest.xfail(reason="??") - df1 = aeronet.add_data(dates, n_procs=1) df2 = aeronet.add_data(dates, n_procs=2) - assert df1.equals(df2) + assert len(df1) == len(df2) + if request.node.callspec.id == "two days": + # Sort first (can use `df1.compare(df2)` for debugging) + df1_ = df1.sort_values(["time", "siteid"]).reset_index(drop=True) + df2_ = df2.sort_values(["time", "siteid"]).reset_index(drop=True) + assert df1_.equals(df2_) + else: + assert df1.equals(df2) assert dates[0] < df1.time.min() < df1.time.max() < dates[-1] From f7321b3c377c49a48a433cef5dbde8978692c4e3 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 20 Sep 2023 16:49:33 -0600 Subject: [PATCH 5/8] notes [skip ci] --- tests/test_aeronet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_aeronet.py b/tests/test_aeronet.py index 39c2dfe9..7054ea45 100644 --- a/tests/test_aeronet.py +++ b/tests/test_aeronet.py @@ -241,6 +241,8 @@ def test_issue100(dates, request): assert len(df1) == len(df2) if request.node.callspec.id == "two days": # Sort first (can use `df1.compare(df2)` for debugging) + # Seems the sorting is site then time, not time then site + # which is why this is necessary df1_ = df1.sort_values(["time", "siteid"]).reset_index(drop=True) df2_ = df2.sort_values(["time", "siteid"]).reset_index(drop=True) assert df1_.equals(df2_) From ab44b37f1e16b87ec0edb28e5539024938981605 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 20 Sep 2023 17:32:07 -0600 Subject: [PATCH 6/8] -2 etc. also ok for joblib --- monetio/obs/aeronet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monetio/obs/aeronet.py b/monetio/obs/aeronet.py index a2024495..2569ef3c 100644 --- a/monetio/obs/aeronet.py +++ b/monetio/obs/aeronet.py @@ -146,7 +146,7 @@ def add_data( interp_to_aod_values=interp_to_aod_values, ) - requested_parallel = n_procs > 1 or n_procs == -1 + requested_parallel = n_procs > 1 or n_procs <= -1 # Split up by day dates = pd.to_datetime(dates) From 1a95aa280b8dedb5be60d3cdd7cfec56d4053882 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 20 Sep 2023 17:37:42 -0600 Subject: [PATCH 7/8] AQS xfail non strict --- tests/test_aqs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_aqs.py b/tests/test_aqs.py index b02a8793..21a9cae3 100644 --- a/tests/test_aqs.py +++ b/tests/test_aqs.py @@ -10,7 +10,7 @@ @pytest.mark.xfail( - not ssl_version < (2,), strict=True, reason="Doesn't work with newer OpenSSL", raises=SSLError + not ssl_version < (2,), strict=False, reason="Doesn't work with newer OpenSSL", raises=SSLError ) def test_aqs(): # For MM data proc example From 1c1a3dabe1fff9a8fc75d14d6b1b3cf0e24fbc9d Mon Sep 17 00:00:00 2001 From: Zachary Moon Date: Mon, 25 Sep 2023 13:12:21 -0400 Subject: [PATCH 8/8] Update monetio/obs/aeronet.py --- monetio/obs/aeronet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monetio/obs/aeronet.py b/monetio/obs/aeronet.py index 2569ef3c..00e0a2ed 100644 --- a/monetio/obs/aeronet.py +++ b/monetio/obs/aeronet.py @@ -146,7 +146,7 @@ def add_data( interp_to_aod_values=interp_to_aod_values, ) - requested_parallel = n_procs > 1 or n_procs <= -1 + requested_parallel = n_procs != 1 # Split up by day dates = pd.to_datetime(dates)