Skip to content

Commit

Permalink
Move download code out of commands.py (stactools-packages#22)
Browse files Browse the repository at this point in the history
* specify int click datatype for years

* set first/last available years as constants

* move download code out of commands.py

* fix confidence layer url format

* add previous frequency and cultivated layers

add URL constants

* update changelog

* Update CHANGELOG.md

Co-authored-by: Pete Gadomski <pete.gadomski@gmail.com>

---------

Co-authored-by: Pete Gadomski <pete.gadomski@gmail.com>
  • Loading branch information
Henry Rodman and gadomski authored Apr 10, 2023
1 parent 9b4e073 commit 2b5806b
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 50 deletions.
11 changes: 10 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [Unreleased]

### Added

- `download_zips` function for downloading raw data from USDA ([#22](https://github.com/stactools-packages/usda-cdl/pull/22))
- Download functionality for 2022 files ([#22](https://github.com/stactools-packages/usda-cdl/pull/22))

### Fixed

- CLI download utility can handle specific years ([#22](https://github.com/stactools-packages/usda-cdl/pull/22))

## [0.1.3] - 2023-02-28

## Added
### Added

- A license link ([#18](https://github.com/stactools-packages/usda-cdl/pull/18))

Expand Down
52 changes: 3 additions & 49 deletions src/stactools/usda_cdl/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@
from typing import List

import click
import requests
from click import Command, Group, Path
from tqdm import tqdm

from stactools.usda_cdl import stac, tile
from stactools.usda_cdl.download import download_zips
from stactools.usda_cdl.tile import DEFAULT_WINDOW_SIZE

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -81,7 +80,7 @@ def tile_file(infile: Path, destination: Path, size: int) -> None:
tile.tile_geotiff(infile_as_path, pathlib.Path(str(destination)), size)

@usda_cdl.command("download", short_help="Download zipped source GeoTIFFs")
@click.argument("years", nargs=-1)
@click.argument("years", nargs=-1, type=int)
@click.argument("destination", nargs=1)
def download(years: List[int], destination: Path) -> None:
"""Downloads the USDA CDL zip files to the destination directory. It's a
Expand All @@ -90,51 +89,6 @@ def download(years: List[int], destination: Path) -> None:
If you just want to download specific years' data, provide those years
on the command line before the destination directory.
"""
os.makedirs(str(destination), exist_ok=True)
if not years:
years = list(range(2008, 2022))
urls = list()
for year in years:
if year < 2008 or year > 2021:
raise Exception(f"Unsupported CDL year: {year}")
urls.append(
"https://www.nass.usda.gov/Research_and_Science/Cropland"
f"/Release/datasets/{year}_30m_cdls.zip"
)
if year >= 2017:
if year == 2021:
urls.append(
"https://www.nass.usda.gov/Research_and_Science/Cropland"
f"/Release/datasets/{year}_30m_Confidence_Layer.zip"
)
else:
urls.append(
"https://www.nass.usda.gov/Research_and_Science/Cropland"
f"/Release/datasets/{year}_30m_confidence_layer.zip"
)
if year == 2021:
urls.append(
"https://www.nass.usda.gov/Research_and_Science/Cropland"
"/Release/datasets/2021_Cultivated_Layer.zip"
)
urls.append(
"https://www.nass.usda.gov/Research_and_Science/Cropland"
"/Release/datasets/Crop_Frequency_2008-2021.zip"
)
for url in urls:
path = pathlib.Path(str(destination)) / os.path.basename(url)
if path.exists():
print(f"{path} already exists, skipping...")
continue
response = requests.get(url, stream=True)
with tqdm.wrapattr(
open(path, "wb"),
"write",
miniters=1,
desc=url.split("/")[-1],
total=int(response.headers.get("content-length", 0)),
) as fout:
for chunk in response.iter_content(chunk_size=4096):
fout.write(chunk)
download_zips(years, pathlib.Path(str(destination)))

return usda_cdl
6 changes: 6 additions & 0 deletions src/stactools/usda_cdl/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,3 +325,9 @@ def is_frequency(self) -> bool:
]
),
)

# Most recent CDL year available for download from USDA NASS.
# Bump this when USDA publishes a new annual release.
MOST_RECENT_YEAR = 2022

# First year for which CDL data is available for download.
FIRST_AVAILABLE_YEAR = 2008
74 changes: 74 additions & 0 deletions src/stactools/usda_cdl/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import os
import pathlib
from typing import List

import requests
from tqdm import tqdm

from stactools.usda_cdl.constants import FIRST_AVAILABLE_YEAR, MOST_RECENT_YEAR

# Root of the USDA NASS "Research and Science" dataset downloads.
URL_BASE = "https://www.nass.usda.gov/Research_and_Science/Cropland/Release/datasets/"

# Per-year cropland data layer zip ({year} is filled in by the caller).
CROPLAND_URL = f"{URL_BASE}{{year}}_30m_cdls.zip"
# Per-year confidence layer zip (available 2017 onward).
CONFIDENCE_URL = f"{URL_BASE}{{year}}_30m_confidence_layer.zip"
# Cumulative crop-frequency zip spanning {first_year} through {last_year}.
FREQUENCY_URL = f"{URL_BASE}Crop_Frequency_{{first_year}}-{{last_year}}.zip"
# Per-year cultivated layer zip (available 2020 onward).
CULTIVATED_URL = f"{URL_BASE}{{year}}_Cultivated_Layer.zip"


def download_zips(years: List[int], destination: pathlib.Path) -> List[pathlib.Path]:
    """Download zipped GeoTiffs from USDA.

    Args:
        years: list of years to download; an empty list means every available
            year (FIRST_AVAILABLE_YEAR through MOST_RECENT_YEAR inclusive)
        destination: destination directory for downloaded files (created if
            it does not exist)

    Returns:
        list of filepaths for the zip files downloaded by this call (files
        that already existed are skipped and not included)

    Raises:
        ValueError: if any requested year is outside the supported range
        requests.HTTPError: if a download request returns an error status
    """
    # Tolerate a plain string for backward compatibility with CLI callers.
    destination = pathlib.Path(destination)
    destination.mkdir(parents=True, exist_ok=True)
    if not years:
        years = list(range(FIRST_AVAILABLE_YEAR, MOST_RECENT_YEAR + 1))

    urls = []
    for year in years:
        if year < FIRST_AVAILABLE_YEAR or year > MOST_RECENT_YEAR:
            # ValueError is a subclass of Exception, so existing broad
            # handlers keep working.
            raise ValueError(f"Unsupported CDL year: {year}")
        urls.append(CROPLAND_URL.format(year=year))

        # in 2017 and beyond there is a confidence layer available
        if year >= 2017:
            confidence_url = CONFIDENCE_URL.format(year=year)
            # in 2021 they changed the file basename slightly ¯\_(ツ)_/¯
            if year >= 2021:
                confidence_url = confidence_url.replace(
                    "confidence_layer", "Confidence_Layer"
                )
            urls.append(confidence_url)

        # starting in 2020, the "Cultivated" and cumulative (2008-present)
        # "Crop Frequency" layers are available
        if year >= 2020:
            urls.append(CULTIVATED_URL.format(year=year))
            urls.append(
                FREQUENCY_URL.format(first_year=FIRST_AVAILABLE_YEAR, last_year=year)
            )

    zips = []
    for url in urls:
        path = destination / os.path.basename(url)
        if path.exists():
            print(f"{path} already exists, skipping...")
            continue
        response = requests.get(url, stream=True)
        # Fail fast on HTTP errors; otherwise an error page would be written
        # into the zip file and later runs would skip the broken download.
        response.raise_for_status()
        with tqdm.wrapattr(
            open(path, "wb"),
            "write",
            miniters=1,
            desc=url.split("/")[-1],
            total=int(response.headers.get("content-length", 0)),
        ) as fout:
            for chunk in response.iter_content(chunk_size=4096):
                fout.write(chunk)

        zips.append(path)

    return zips

0 comments on commit 2b5806b

Please sign in to comment.