Adding Northeastern China Crop Map Dataset (microsoft#1666)

* Add files via upload Initial commit for adding Northeastern China Crop Map dataset * Added northeastern_china_cropmap (NCCM) definition to _init_.py * Update northeastern_china_cropmap.py * Added tests/data * added test_nccm.py * Updated datasets.rst and geo_datasets.csv * Latest changes to nccm.py * changes to data.py, nccm.py, test_nccm.py * Update test_nccm.py * Debug 1 * new changes * Latest update * Update torchgeo/datasets/nccm.py Co-authored-by: Yi-Chia Chang <61452667+yichiac@users.noreply.github.com> * Fixed style errors * Fixed style errors * Fixed style errors * Update docs/api/datasets.rst Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com> * Update torchgeo/datasets/nccm.py Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com> * Update torchgeo/datasets/nccm.py Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com> * Delete tests/data/nccm/.DS_Store * Update data.py * Update nccm.py * Update torchgeo/datasets/nccm.py Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com> * Update torchgeo/datasets/nccm.py Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com> * Update torchgeo/datasets/nccm.py Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com> * Update torchgeo/datasets/nccm.py Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com> * Update nccm.py * Update nccm.py * Resolved few comments * Fixed plotting functions, resolved comments * Fixed test cases * Fixed doc issue * Latest * Fixed doc * Latest * Latest * Latest * Update torchgeo/datasets/nccm.py Co-authored-by: Adam J. 
Stewart <ajstewart426@gmail.com> * Latest changes: removed years * Update torchgeo/datasets/nccm.py Co-authored-by: Yi-Chia Chang <61452667+yichiac@users.noreply.github.com> * Update torchgeo/datasets/nccm.py Co-authored-by: Yi-Chia Chang <61452667+yichiac@users.noreply.github.com> * Update torchgeo/datasets/nccm.py Co-authored-by: Yi-Chia Chang <61452667+yichiac@users.noreply.github.com> * Update torchgeo/datasets/nccm.py Co-authored-by: Yi-Chia Chang <61452667+yichiac@users.noreply.github.com> * Update torchgeo/datasets/nccm.py Co-authored-by: Yi-Chia Chang <61452667+yichiac@users.noreply.github.com> * Update torchgeo/datasets/nccm.py Co-authored-by: Yi-Chia Chang <61452667+yichiac@users.noreply.github.com> * Removed unnecessary variables and fixed download path * Latest changes * Latest changes * Latest changes * Fixed spacing * Latest changes * Update torchgeo/datasets/nccm.py Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com> * Update docs/api/geo_datasets.csv Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com> * Update nccm.py * Update data.py * Update nccm.py * Update torchgeo/datasets/nccm.py Co-authored-by: Yi-Chia Chang <61452667+yichiac@users.noreply.github.com> * latest changes * Latest * Latest changes * Fixed torch.full() * removed print linke --------- Co-authored-by: shreya28 <“shreya28@illinois.edu”> Co-authored-by: Yi-Chia Chang <61452667+yichiac@users.noreply.github.com> Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com>
isaaccorley · Nov 17, 2023 · bc5cb4c · bc5cb4c
1 parent f1b751c
commit bc5cb4c
Show file tree

Hide file tree

Showing 10 changed files with 360 additions and 0 deletions.
diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst
@@ -129,6 +129,11 @@ NAIP
 
 .. autoclass:: NAIP
 
+NCCM
+^^^^
+
+.. autoclass:: NCCM
+
 NLCD
 ^^^^
 

diff --git a/docs/api/geo_datasets.csv b/docs/api/geo_datasets.csv
@@ -18,5 +18,6 @@ Dataset,Type,Source,Size (px),Resolution (m)
 `Landsat`_,Imagery,Landsat,"8,900x8,900",30
 `NAIP`_,Imagery,Aerial,"6,100x7,600",1
 `NLCD`_,Masks,Landsat,-,30
+`NCCM`_,Masks,Sentinel-2,-,10
 `Open Buildings`_,Geometries,"Maxar, CNES/Airbus",-,-
 `Sentinel`_,Imagery,Sentinel,"10,000x10,000",10
diff --git a/tests/data/nccm/13090442.zip b/tests/data/nccm/13090442.zip
diff --git a/tests/data/nccm/13090442/CDL2017_clip.tif b/tests/data/nccm/13090442/CDL2017_clip.tif
diff --git a/tests/data/nccm/13090442/CDL2018_clip1.tif b/tests/data/nccm/13090442/CDL2018_clip1.tif
diff --git a/tests/data/nccm/13090442/CDL2019_clip.tif b/tests/data/nccm/13090442/CDL2019_clip.tif
diff --git a/tests/data/nccm/data.py b/tests/data/nccm/data.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import hashlib
+import os
+import shutil
+
+import numpy as np
+import rasterio
+from rasterio.crs import CRS
+from rasterio.transform import Affine
+
+# Height and width, in pixels, of every raster written by create_file()
+SIZE = 32
+
+# Seed NumPy's global RNG so repeated runs draw the same pixel values
+np.random.seed(0)
+# Names of the rasters to generate, one per year (2017, 2018, 2019)
+files = ["CDL2017_clip.tif", "CDL2018_clip1.tif", "CDL2019_clip.tif"]
+
+
+def create_file(path: str, dtype: str):
+    """Create the testing file."""
+    profile = {
+        "driver": "GTiff",
+        "dtype": dtype,
+        "count": 1,
+        "crs": CRS.from_epsg(4326),
+        "transform": Affine(
+            8.983152841195208e-05,
+            0.0,
+            115.483402043364,
+            0.0,
+            -8.983152841195208e-05,
+            53.531397320113605,
+        ),
+        "height": SIZE,
+        "width": SIZE,
+        "compress": "lzw",
+        "predictor": 2,
+    }
+
+    allowed_values = [0, 1, 2, 3, 15]
+
+    Z = np.random.choice(allowed_values, size=(SIZE, SIZE))
+
+    with rasterio.open(path, "w", **profile) as src:
+        src.write(Z, 1)
+
+
+if __name__ == "__main__":
+    dir = os.path.join(os.getcwd(), "13090442")
+
+    if os.path.exists(dir) and os.path.isdir(dir):
+        shutil.rmtree(dir)
+
+    os.makedirs(dir, exist_ok=True)
+
+    for file in files:
+        create_file(os.path.join(dir, file), dtype="int8")
+
+    # Compress data
+    shutil.make_archive("13090442", "zip", ".", dir)
+
+    # Compute checksums
+    with open("13090442.zip", "rb") as f:
+        md5 = hashlib.md5(f.read()).hexdigest()
+        print(f"13090442.zip: {md5}")
diff --git a/tests/datasets/test_nccm.py b/tests/datasets/test_nccm.py
@@ -0,0 +1,79 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import os
+import shutil
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import pytest
+import torch
+import torch.nn as nn
+from pytest import MonkeyPatch
+from rasterio.crs import CRS
+
+import torchgeo.datasets.utils
+from torchgeo.datasets import NCCM, BoundingBox, IntersectionDataset, UnionDataset
+from torchgeo.datasets.utils import DatasetNotFoundError
+
+
+def download_url(url: str, root: str, *args: str, **kwargs: str) -> None:
+    shutil.copy(url, root)
+
+
+class TestNCCM:
+    @pytest.fixture
+    def dataset(self, monkeypatch: MonkeyPatch, tmp_path: Path) -> NCCM:
+        monkeypatch.setattr(torchgeo.datasets.nccm, "download_url", download_url)
+        url = os.path.join("tests", "data", "nccm", "13090442.zip")
+        transforms = nn.Identity()
+        monkeypatch.setattr(NCCM, "url", url)
+        root = str(tmp_path)
+        return NCCM(root, transforms=transforms, download=True, checksum=True)
+
+    def test_getitem(self, dataset: NCCM) -> None:
+        x = dataset[dataset.bounds]
+        assert isinstance(x, dict)
+        assert isinstance(x["crs"], CRS)
+        assert isinstance(x["mask"], torch.Tensor)
+
+    def test_and(self, dataset: NCCM) -> None:
+        ds = dataset & dataset
+        assert isinstance(ds, IntersectionDataset)
+
+    def test_or(self, dataset: NCCM) -> None:
+        ds = dataset | dataset
+        assert isinstance(ds, UnionDataset)
+
+    def test_already_extracted(self, dataset: NCCM) -> None:
+        NCCM(dataset.paths, download=True)
+
+    def test_already_downloaded(self, tmp_path: Path) -> None:
+        pathname = os.path.join("tests", "data", "nccm", "13090442.zip")
+        root = str(tmp_path)
+        shutil.copy(pathname, root)
+        NCCM(root)
+
+    def test_plot(self, dataset: NCCM) -> None:
+        query = dataset.bounds
+        x = dataset[query]
+        dataset.plot(x, suptitle="Test")
+        plt.close()
+
+    def test_plot_prediction(self, dataset: NCCM) -> None:
+        query = dataset.bounds
+        x = dataset[query]
+        x["prediction"] = x["mask"].clone()
+        dataset.plot(x, suptitle="Prediction")
+        plt.close()
+
+    def test_not_downloaded(self, tmp_path: Path) -> None:
+        with pytest.raises(DatasetNotFoundError, match="Dataset not found"):
+            NCCM(str(tmp_path))
+
+    def test_invalid_query(self, dataset: NCCM) -> None:
+        query = BoundingBox(0, 0, 0, 0, 0, 0)
+        with pytest.raises(
+            IndexError, match="query: .* not found in index with bounds:"
+        ):
+            dataset[query]
diff --git a/torchgeo/datasets/__init__.py b/torchgeo/datasets/__init__.py
@@ -77,6 +77,7 @@
 from .millionaid import MillionAID
 from .naip import NAIP
 from .nasa_marine_debris import NASAMarineDebris
+from .nccm import NCCM
 from .nlcd import NLCD
 from .openbuildings import OpenBuildings
 from .oscd import OSCD
@@ -168,6 +169,7 @@
     "Landsat8",
     "Landsat9",
     "NAIP",
+    "NCCM",
     "NLCD",
     "OpenBuildings",
     "Sentinel",

diff --git a/torchgeo/datasets/nccm.py b/torchgeo/datasets/nccm.py
@@ -0,0 +1,206 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+"""Northeastern China Crop Map Dataset."""
+
+import glob
+import os
+from collections.abc import Iterable
+from typing import Any, Callable, Optional, Union
+
+import matplotlib.pyplot as plt
+import torch
+from matplotlib.figure import Figure
+from rasterio.crs import CRS
+
+from .geo import RasterDataset
+from .utils import BoundingBox, DatasetNotFoundError, download_url, extract_archive
+
+
+class NCCM(RasterDataset):
+    """The Northeastern China Crop Map Dataset.
+
+    Link: https://www.nature.com/articles/s41597-021-00827-9
+
+    This dataset produced annual 10-m crop maps of the
+    major crops (maize, soybean, and rice)
+    in Northeast China from 2017 to 2019, using hierarchial mapping strategies,
+    random forest classifiers, interpolated and
+    smoothed 10-day Sentinel-2 time series data and
+    optimized features from spectral, temporal and
+    textural characteristics of the land surface.
+    The resultant maps have high overall accuracies (OA)
+    based on ground truth data. The dataset contains information
+    specific to three years: 2017, 2018, 2019.
+
+    The dataset contains 5 classes:
+
+    0. paddy rice
+    1. maize
+    2. soybean
+    3. others crops and lands
+    4. nodata
+
+    Dataset format:
+
+    * Three .TIF files containing the labels
+    * JavaScript code to download images from the dataset.
+
+    If you use this dataset in your research, please cite the following paper:
+
+    * https://doi.org/10.1038/s41597-021-00827-9
+
+    .. versionadded:: 0.6
+    """
+
+    filename_regex = r"CDL(?P<year>\d{4})_clip"
+    filename_glob = "CDL*.*"
+    zipfile_glob = "13090442.zip"
+
+    date_format = "%Y"
+    is_image = False
+    url = "https://figshare.com/ndownloader/articles/13090442/versions/1"
+    md5 = "eae952f1b346d7e649d027e8139a76f5"
+
+    cmap = {
+        0: (0, 255, 0, 255),
+        1: (255, 0, 0, 255),
+        2: (255, 255, 0, 255),
+        3: (128, 128, 128, 255),
+        15: (255, 255, 255, 255),
+    }
+
+    def __init__(
+        self,
+        paths: Union[str, Iterable[str]] = "data",
+        crs: Optional[CRS] = None,
+        res: Optional[float] = None,
+        transforms: Optional[Callable[[dict[str, Any]], dict[str, Any]]] = None,
+        cache: bool = True,
+        download: bool = False,
+        checksum: bool = False,
+    ) -> None:
+        """Initialize a new dataset.
+
+        Args:
+            paths: one or more root directories to search or files to load
+            crs: :term:`coordinate reference system (CRS)` to warp to
+                (defaults to the CRS of the first file found)
+            res: resolution of the dataset in units of CRS
+                (defaults to the resolution of the first file found)
+            transforms: a function/transform that takes an input sample
+                and returns a transformed version
+            cache: if True, cache file handle to speed up repeated sampling
+            download: if True, download dataset and store it in the root directory
+            checksum: if True, check the MD5 after downloading files (may be slow)
+
+        Raises:
+            DatasetNotFoundError: If dataset is not found and *download* is False.
+        """
+        self.paths = paths
+        self.download = download
+        self.checksum = checksum
+        self.ordinal_map = torch.full((max(self.cmap.keys()) + 1,), 4, dtype=self.dtype)
+        self.ordinal_cmap = torch.zeros((5, 4), dtype=torch.uint8)
+
+        self._verify()
+        super().__init__(paths, crs, res, transforms=transforms, cache=cache)
+
+        for i, (k, v) in enumerate(self.cmap.items()):
+            self.ordinal_map[k] = i
+            self.ordinal_cmap[i] = torch.tensor(v)
+
+    def __getitem__(self, query: BoundingBox) -> dict[str, Any]:
+        """Retrieve mask and metadata indexed by query.
+
+        Args:
+            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index
+
+        Returns:
+            sample of mask and metadata at that index
+
+        Raises:
+            IndexError: if query is not found in the index
+        """
+        sample = super().__getitem__(query)
+        sample["mask"] = self.ordinal_map[sample["mask"]]
+        return sample
+
+    def _verify(self) -> None:
+        """Verify the integrity of the dataset."""
+        # Check if the extracted files already exist
+        if self.files:
+            return
+
+        # Check if the zip file has already been downloaded
+        assert isinstance(self.paths, str)
+        pathname = os.path.join(self.paths, "**", self.zipfile_glob)
+        if glob.glob(pathname, recursive=True):
+            self._extract()
+            return
+
+        # Check if the user requested to download the dataset
+        if not self.download:
+            raise DatasetNotFoundError(self)
+
+        # Download the dataset
+        self._download()
+        self._extract()
+
+    def _download(self) -> None:
+        """Download the dataset."""
+        filename = "13090442.zip"
+        download_url(
+            self.url, self.paths, filename, md5=self.md5 if self.checksum else None
+        )
+
+    def _extract(self) -> None:
+        """Extract the dataset."""
+        assert isinstance(self.paths, str)
+        pathname = os.path.join(self.paths, "**", self.zipfile_glob)
+        extract_archive(glob.glob(pathname, recursive=True)[0], self.paths)
+
+    def plot(
+        self,
+        sample: dict[str, Any],
+        show_titles: bool = True,
+        suptitle: Optional[str] = None,
+    ) -> Figure:
+        """Plot a sample from the dataset.
+
+        Args:
+            sample: a sample returned by :meth:`NCCM.__getitem__`
+            show_titles: flag indicating whether to show titles above each panel
+            suptitle: optional string to use as a suptitle
+
+        Returns:
+            a matplotlib Figure with the rendered sample
+        """
+        mask = sample["mask"].squeeze()
+        ncols = 1
+
+        showing_predictions = "prediction" in sample
+        if showing_predictions:
+            pred = sample["prediction"].squeeze()
+            ncols = 2
+
+        fig, axs = plt.subplots(
+            nrows=1, ncols=ncols, figsize=(ncols * 4, 4), squeeze=False
+        )
+
+        axs[0, 0].imshow(self.ordinal_cmap[mask], interpolation="none")
+        axs[0, 0].axis("off")
+
+        if show_titles:
+            axs[0, 0].set_title("Mask")
+
+        if showing_predictions:
+            axs[0, 1].imshow(self.ordinal_cmap[pred], interpolation="none")
+            axs[0, 1].axis("off")
+            if show_titles:
+                axs[0, 1].set_title("Prediction")
+
+        if suptitle is not None:
+            plt.suptitle(suptitle)
+
+        return fig
-Original file line number
+Diff line change
@@ Expand Up / @@ -129,6 +129,11 @@ NAIP @@
     .. autoclass:: NAIP
+    NCCM
+    ^^^^
+    .. autoclass:: NCCM
     NLCD
     ^^^^
@@ Expand Down @@