Skip to content

Commit

Permalink
Move download code out of commands.py (stactools-packages#22)
Browse files Browse the repository at this point in the history
* specify int click datatype for years

* set first/last available years as constants

* move download code out of commands.py

* fix confidence layer url format

* add previous frequency and cultivated layers

add URL constants

* update changelog

* Update CHANGELOG.md

Co-authored-by: Pete Gadomski <pete.gadomski@gmail.com>

---------

Co-authored-by: Pete Gadomski <pete.gadomski@gmail.com>
  • Loading branch information
Henry Rodman and gadomski authored Apr 10, 2023
1 parent 9b4e073 commit 2b5806b
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 50 deletions.
11 changes: 10 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [Unreleased]

### Added

- `download_zips` function for downloading raw data from USDA ([#22](https://github.com/stactools-packages/usda-cdl/pull/22))
- Download functionality for 2022 files ([#22](https://github.com/stactools-packages/usda-cdl/pull/22))

### Fixed

- CLI download utility can handle specific years ([#22](https://github.com/stactools-packages/usda-cdl/pull/22))

## [0.1.3] - 2023-02-28

## Added
### Added

- A license link ([#18](https://github.com/stactools-packages/usda-cdl/pull/18))

Expand Down
52 changes: 3 additions & 49 deletions src/stactools/usda_cdl/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@
from typing import List

import click
import requests
from click import Command, Group, Path
from tqdm import tqdm

from stactools.usda_cdl import stac, tile
from stactools.usda_cdl.download import download_zips
from stactools.usda_cdl.tile import DEFAULT_WINDOW_SIZE

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -81,7 +80,7 @@ def tile_file(infile: Path, destination: Path, size: int) -> None:
tile.tile_geotiff(infile_as_path, pathlib.Path(str(destination)), size)

@usda_cdl.command("download", short_help="Download zipped source GeoTIFFs")
@click.argument("years", nargs=-1)
@click.argument("years", nargs=-1, type=int)
@click.argument("destination", nargs=1)
def download(years: List[int], destination: Path) -> None:
"""Downloads the USDA CDL zip files to the destination directory. It's a
Expand All @@ -90,51 +89,6 @@ def download(years: List[int], destination: Path) -> None:
If you just want to download specific years' data, provide those years
on the command line before the destination directory.
"""
os.makedirs(str(destination), exist_ok=True)
if not years:
years = list(range(2008, 2022))
urls = list()
for year in years:
if year < 2008 or year > 2021:
raise Exception(f"Unsupported CDL year: {year}")
urls.append(
"https://www.nass.usda.gov/Research_and_Science/Cropland"
f"/Release/datasets/{year}_30m_cdls.zip"
)
if year >= 2017:
if year == 2021:
urls.append(
"https://www.nass.usda.gov/Research_and_Science/Cropland"
f"/Release/datasets/{year}_30m_Confidence_Layer.zip"
)
else:
urls.append(
"https://www.nass.usda.gov/Research_and_Science/Cropland"
f"/Release/datasets/{year}_30m_confidence_layer.zip"
)
if year == 2021:
urls.append(
"https://www.nass.usda.gov/Research_and_Science/Cropland"
"/Release/datasets/2021_Cultivated_Layer.zip"
)
urls.append(
"https://www.nass.usda.gov/Research_and_Science/Cropland"
"/Release/datasets/Crop_Frequency_2008-2021.zip"
)
for url in urls:
path = pathlib.Path(str(destination)) / os.path.basename(url)
if path.exists():
print(f"{path} already exists, skipping...")
continue
response = requests.get(url, stream=True)
with tqdm.wrapattr(
open(path, "wb"),
"write",
miniters=1,
desc=url.split("/")[-1],
total=int(response.headers.get("content-length", 0)),
) as fout:
for chunk in response.iter_content(chunk_size=4096):
fout.write(chunk)
download_zips(years, pathlib.Path(str(destination)))

return usda_cdl
6 changes: 6 additions & 0 deletions src/stactools/usda_cdl/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,3 +325,9 @@ def is_frequency(self) -> bool:
]
),
)

# Most recent CDL year available for download from USDA NASS.
# Bump this when USDA publishes a new annual release.
MOST_RECENT_YEAR = 2022

# First year for which CDL data is available for download.
FIRST_AVAILABLE_YEAR = 2008
74 changes: 74 additions & 0 deletions src/stactools/usda_cdl/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import os
import pathlib
from typing import List

import requests
from tqdm import tqdm

from stactools.usda_cdl.constants import FIRST_AVAILABLE_YEAR, MOST_RECENT_YEAR

# Root of the USDA NASS "Research and Science" dataset downloads.
URL_BASE = "https://www.nass.usda.gov/Research_and_Science/Cropland/Release/datasets/"

# Per-year cropland data layer zip ({year} is filled in by the caller).
CROPLAND_URL = f"{URL_BASE}{{year}}_30m_cdls.zip"
# Per-year confidence layer zip (available 2017 onward).
CONFIDENCE_URL = f"{URL_BASE}{{year}}_30m_confidence_layer.zip"
# Cumulative crop-frequency zip spanning {first_year} through {last_year}.
FREQUENCY_URL = f"{URL_BASE}Crop_Frequency_{{first_year}}-{{last_year}}.zip"
# Per-year cultivated layer zip (available 2020 onward).
CULTIVATED_URL = f"{URL_BASE}{{year}}_Cultivated_Layer.zip"


def download_zips(years: List[int], destination: pathlib.Path) -> List[pathlib.Path]:
    """Download zipped GeoTiffs from USDA.

    Args:
        years: list of years to download; an empty list means every available
            year (FIRST_AVAILABLE_YEAR through MOST_RECENT_YEAR inclusive)
        destination: destination directory for downloaded files (created if
            it does not exist)

    Returns:
        list of filepaths for the zip files downloaded by this call (files
        that already existed are skipped and not included)

    Raises:
        ValueError: if any requested year is outside the supported range
        requests.HTTPError: if a download request returns an error status
    """
    # Tolerate a plain string for backward compatibility with CLI callers.
    destination = pathlib.Path(destination)
    destination.mkdir(parents=True, exist_ok=True)
    if not years:
        years = list(range(FIRST_AVAILABLE_YEAR, MOST_RECENT_YEAR + 1))

    urls = []
    for year in years:
        if year < FIRST_AVAILABLE_YEAR or year > MOST_RECENT_YEAR:
            # ValueError is a subclass of Exception, so existing broad
            # handlers keep working.
            raise ValueError(f"Unsupported CDL year: {year}")
        urls.append(CROPLAND_URL.format(year=year))

        # in 2017 and beyond there is a confidence layer available
        if year >= 2017:
            confidence_url = CONFIDENCE_URL.format(year=year)
            # in 2021 they changed the file basename slightly ¯\_(ツ)_/¯
            if year >= 2021:
                confidence_url = confidence_url.replace(
                    "confidence_layer", "Confidence_Layer"
                )
            urls.append(confidence_url)

        # starting in 2020, the "Cultivated" and cumulative (2008-present)
        # "Crop Frequency" layers are available
        if year >= 2020:
            urls.append(CULTIVATED_URL.format(year=year))
            urls.append(
                FREQUENCY_URL.format(first_year=FIRST_AVAILABLE_YEAR, last_year=year)
            )

    zips = []
    for url in urls:
        path = destination / os.path.basename(url)
        if path.exists():
            print(f"{path} already exists, skipping...")
            continue
        response = requests.get(url, stream=True)
        # Fail fast on HTTP errors; otherwise an error page would be written
        # into the zip file and later runs would skip the broken download.
        response.raise_for_status()
        with tqdm.wrapattr(
            open(path, "wb"),
            "write",
            miniters=1,
            desc=url.split("/")[-1],
            total=int(response.headers.get("content-length", 0)),
        ) as fout:
            for chunk in response.iter_content(chunk_size=4096):
                fout.write(chunk)

        zips.append(path)

    return zips

0 comments on commit 2b5806b

Please sign in to comment.