Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding the GlobBiomass dataset #395

Merged
merged 11 commits into from
Feb 26, 2022
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ Esri2020

.. autoclass:: Esri2020

GlobBiomass
^^^^^^^^^^^
.. autoclass:: GlobBiomass

Landsat
^^^^^^^

Expand Down
Binary file added tests/data/globbiomass/N00E020_agb.tif
Binary file not shown.
Binary file added tests/data/globbiomass/N00E020_agb.zip
Binary file not shown.
Binary file added tests/data/globbiomass/N00E020_agb_err.tif
Binary file not shown.
63 changes: 63 additions & 0 deletions tests/data/globbiomass/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import hashlib
import os
import random
import zipfile

import numpy as np
import rasterio

# Seed both RNGs so regenerated fixtures (and therefore their md5 checksums)
# are reproducible across runs.
np.random.seed(0)
random.seed(0)

# Height and width, in pixels, of each generated test raster.
SIZE = 64


# Rasters to generate: the biomass map and its error (uncertainty) map.
files = [{"image": "N00E020_agb.tif"}, {"image": "N00E020_agb_err.tif"}]


def create_file(path: str, dtype: str, num_channels: int) -> None:
    """Write a small random GeoTIFF test fixture to *path*.

    Args:
        path: output file path for the raster
        dtype: integer dtype string understood by numpy/rasterio, e.g. "int32"
        num_channels: band count written into the GTiff profile

    Note:
        The pixel data written is always shaped ``(1, SIZE, SIZE)`` regardless
        of ``num_channels`` — callers in this script only ever pass 1.
    """
    profile = {
        "driver": "GTiff",
        "dtype": dtype,
        "count": num_channels,
        "crs": "epsg:4326",
        "transform": rasterio.transform.from_bounds(0, 0, 1, 1, 1, 1),
        "height": SIZE,
        "width": SIZE,
        "compress": "lzw",
        "predictor": 2,
    }

    # Random non-negative integers spanning the full positive range of dtype.
    Z = np.random.randint(
        np.iinfo(profile["dtype"]).max, size=(1, SIZE, SIZE), dtype=profile["dtype"]
    )
    # Context manager guarantees the dataset is flushed and closed
    # (the previous version leaked the open handle).
    with rasterio.open(path, "w", **profile) as src:
        src.write(Z)


if __name__ == "__main__":
    zipfilename = "N00E020_agb.zip"
    files_to_zip = []

    for file_dict in files:
        path = file_dict["image"]
        # Remove any stale fixture left over from a previous run.
        if os.path.exists(path):
            os.remove(path)
        # Create the single-band mask file.
        create_file(path, dtype="int32", num_channels=1)
        files_to_zip.append(path)

    # Compress the rasters into the archive layout the dataset expects.
    # ("zf"/"fname" avoid shadowing the builtins ``zip`` and ``file``.)
    with zipfile.ZipFile(zipfilename, "w") as zf:
        for fname in files_to_zip:
            zf.write(fname, arcname=fname)

    # Print the archive's md5 so it can be pasted into the test's ``md5s``
    # dict; this is a fixture-integrity check, not a security measure.
    with open(zipfilename, "rb") as f:
        md5 = hashlib.md5(f.read()).hexdigest()
    print(f"{zipfilename}: {md5}")
83 changes: 83 additions & 0 deletions tests/datasets/test_globbiomass.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import os
import shutil
from pathlib import Path
from typing import Generator

import pytest
import torch
import torch.nn as nn
from _pytest.monkeypatch import MonkeyPatch
from rasterio.crs import CRS

from torchgeo.datasets import (
BoundingBox,
GlobBiomass,
IntersectionDataset,
UnionDataset,
)


class TestGlobBiomass:
    """Unit tests for the GlobBiomass raster dataset."""

    @pytest.fixture
    def dataset(
        self, monkeypatch: Generator[MonkeyPatch, None, None], tmp_path: Path
    ) -> GlobBiomass:
        """Stage the fixture zip in a temp dir and build a checksummed dataset."""
        zip_name = "N00E020_agb.zip"
        shutil.copy(os.path.join("tests", "data", "globbiomass", zip_name), tmp_path)
        # Point the class at the test fixture's checksum instead of the real one.
        monkeypatch.setattr(  # type: ignore[attr-defined]
            GlobBiomass, "md5s", {zip_name: "7b7b981149aa31a099f453fef32b644f"}
        )
        return GlobBiomass(
            str(tmp_path),
            transforms=nn.Identity(),  # type: ignore[attr-defined]
            checksum=True,
        )

    def test_getitem(self, dataset: GlobBiomass) -> None:
        sample = dataset[dataset.bounds]
        assert isinstance(sample, dict)
        assert isinstance(sample["crs"], CRS)
        assert isinstance(sample["mask"], torch.Tensor)
        assert isinstance(sample["error_mask"], torch.Tensor)

    def test_already_extracted(self, dataset: GlobBiomass) -> None:
        # Instantiating again over already-extracted data must succeed.
        GlobBiomass(root=dataset.root)

    def test_not_downloaded(self, tmp_path: Path) -> None:
        with pytest.raises(RuntimeError, match="Dataset not found"):
            GlobBiomass(str(tmp_path), checksum=True)

    def test_corrupted(self, tmp_path: Path) -> None:
        bad_zip = os.path.join(tmp_path, "N00E020_agb.zip")
        with open(bad_zip, "w") as f:
            f.write("bad")
        with pytest.raises(RuntimeError, match="Dataset found, but corrupted."):
            GlobBiomass(root=str(tmp_path), checksum=True)

    def test_and(self, dataset: GlobBiomass) -> None:
        assert isinstance(dataset & dataset, IntersectionDataset)

    def test_or(self, dataset: GlobBiomass) -> None:
        assert isinstance(dataset | dataset, UnionDataset)

    def test_plot(self, dataset: GlobBiomass) -> None:
        sample = dataset[dataset.bounds]
        dataset.plot(sample, suptitle="Test")

    def test_plot_prediction(self, dataset: GlobBiomass) -> None:
        sample = dataset[dataset.bounds]
        sample["prediction"] = sample["mask"].clone()
        dataset.plot(sample, suptitle="Prediction")

    def test_invalid_query(self, dataset: GlobBiomass) -> None:
        with pytest.raises(
            IndexError, match="query: .* not found in index with bounds:"
        ):
            dataset[BoundingBox(100, 100, 100, 100, 0, 0)]
2 changes: 2 additions & 0 deletions torchgeo/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
VisionDataset,
)
from .gid15 import GID15
from .globbiomass import GlobBiomass
from .idtrees import IDTReeS
from .inria import InriaAerialImageLabeling
from .landcoverai import LandCoverAI
Expand Down Expand Up @@ -98,6 +99,7 @@
"ChesapeakeWV",
"ChesapeakeCVPR",
"Esri2020",
"GlobBiomass",
"Landsat",
"Landsat1",
"Landsat2",
Expand Down
Loading