diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst
index dfc63e08bc7..e5dd579b96d 100644
--- a/docs/api/datasets.rst
+++ b/docs/api/datasets.rst
@@ -77,6 +77,11 @@ GlobBiomass
.. autoclass:: GlobBiomass
+iNaturalist
+^^^^^^^^^^^
+
+.. autoclass:: INaturalist
+
Landsat
^^^^^^^
diff --git a/tests/data/inaturalist/data.py b/tests/data/inaturalist/data.py
new file mode 100755
index 00000000000..6bfbc685008
--- /dev/null
+++ b/tests/data/inaturalist/data.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import pandas as pd
+
+# Name of the CSV fixture. The path is relative, so the file is written to the
+# current working directory: run this script from the directory it lives in to
+# regenerate the checked-in fixture in place.
+filename = "observations-012345.csv"
+
+# User can select which columns to export. The following are the default columns.
+# Not all columns may exist in the actual dataset.
+#
+# Most columns repeat one placeholder value on every row. Only observed_on,
+# time_observed_at and longitude differ between rows, so that each row covers a
+# different missing-value case:
+#
+# * row 0: no longitude (and no date or time)
+# * row 1: coordinates present, but neither date nor time
+# * row 2: coordinates and date (day precision), no time
+# * row 3: coordinates, date and time (second precision)
+size = 4
+data = {
+ "id": [""] * size,
+ "observed_on_string": [""] * size,
+ # Day-precision date: present only on rows 2 and 3
+ "observed_on": ["", "", "2022-05-07", "2022-05-07"],
+ # Second-precision timestamp with UTC offset: present only on row 3
+ "time_observed_at": ["", "", "", "2022-05-07 11:02:53 +0100"],
+ "time_zone": ["Central Time (US & Canada)"] * size,
+ "user_id": [123] * size,
+ "user_login": ["darwin"] * size,
+ "created_at": ["2022-05-07 11:02:53 +0100"] * size,
+ "updated_at": ["2022-05-07 11:02:53 +0100"] * size,
+ "quality_grade": ["research"] * size,
+ # NOTE(review): "CCO" (letter O) looks like a typo for "CC0" (digit zero). It is
+ # left unchanged here because the checked-in CSV contains the same value.
+ "license": ["CCO"] * size,
+ "url": ["https://inaturalist.org/observations/123"] * size,
+ "image_url": [
+ "https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg"
+ ]
+ * size,
+ "sound_url": ["https://static.inaturalist.org/sounds/123.m4a?123"] * size,
+ "tag_list": ["Chicago"] * size,
+ "description": [""] * size,
+ "num_identification_agreements": [1] * size,
+ "num_identification_disagreements": [0] * size,
+ "captive_cultivated": ["false"] * size,
+ "oauth_application_id": [""] * size,
+ "place_guess": ["Chicago"] * size,
+ "latitude": [41.881832] * size,
+ # Row 0 deliberately has no longitude; the remaining rows share one value
+ "longitude": [""] + [-87.623177] * (size - 1),
+ "positional_accuracy": [5] * size,
+ "private_place_guess": [""] * size,
+ "private_latitude": [""] * size,
+ "private_longitude": [""] * size,
+ "public_positional_accuracy": [5] * size,
+ "geoprivacy": [""] * size,
+ "taxon_geoprivacy": [""] * size,
+ "coordinates_obscured": ["false"] * size,
+ "positioning_method": ["gps"] * size,
+ "positioning_device": ["gps"] * size,
+ "species_guess": ["Homo sapiens"] * size,
+ "scientific_name": ["Homo sapiens"] * size,
+ "common_name": ["human"] * size,
+ "iconic_taxon_name": ["Animalia"] * size,
+ "taxon_id": [123] * size,
+}
+
+# The dict's key order becomes the CSV's column order. index=False keeps the
+# DataFrame's row index out of the file, so the header starts with "id".
+df = pd.DataFrame(data)
+df.to_csv(filename, index=False)
diff --git a/tests/data/inaturalist/observations-012345.csv b/tests/data/inaturalist/observations-012345.csv
new file mode 100644
index 00000000000..dc340cbe0a3
--- /dev/null
+++ b/tests/data/inaturalist/observations-012345.csv
@@ -0,0 +1,5 @@
+id,observed_on_string,observed_on,time_observed_at,time_zone,user_id,user_login,created_at,updated_at,quality_grade,license,url,image_url,sound_url,tag_list,description,num_identification_agreements,num_identification_disagreements,captive_cultivated,oauth_application_id,place_guess,latitude,longitude,positional_accuracy,private_place_guess,private_latitude,private_longitude,public_positional_accuracy,geoprivacy,taxon_geoprivacy,coordinates_obscured,positioning_method,positioning_device,species_guess,scientific_name,common_name,iconic_taxon_name,taxon_id
+,,,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
+,,,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
+,,2022-05-07,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
+,,2022-05-07,2022-05-07 11:02:53 +0100,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
diff --git a/tests/datasets/test_inaturalist.py b/tests/datasets/test_inaturalist.py
new file mode 100644
index 00000000000..623c64837bd
--- /dev/null
+++ b/tests/datasets/test_inaturalist.py
@@ -0,0 +1,72 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import builtins
+import os
+from pathlib import Path
+from typing import Any
+
+import pytest
+from _pytest.monkeypatch import MonkeyPatch
+
+from torchgeo.datasets import (
+ BoundingBox,
+ INaturalist,
+ IntersectionDataset,
+ UnionDataset,
+)
+
+pytest.importorskip("pandas", minversion="0.23.2")
+
+
+# Tests for the INaturalist dataset, run against the small CSV fixture stored in
+# tests/data/inaturalist.
+class TestINaturalist:
+ # Class-scoped so the fixture is built once and shared by the tests below.
+ # The root is a relative path, so it is resolved against the directory pytest
+ # is run from.
+ @pytest.fixture(scope="class")
+ def dataset(self) -> INaturalist:
+ root = os.path.join("tests", "data", "inaturalist")
+ return INaturalist(root)
+
+ # Querying the dataset's own bounds must return a sample dict.
+ def test_getitem(self, dataset: INaturalist) -> None:
+ x = dataset[dataset.bounds]
+ assert isinstance(x, dict)
+
+ # The fixture has 4 rows, but the row without a longitude is skipped when the
+ # dataset is indexed, leaving 3 entries.
+ def test_len(self, dataset: INaturalist) -> None:
+ assert len(dataset) == 3
+
+ # `&` must produce an IntersectionDataset.
+ def test_and(self, dataset: INaturalist) -> None:
+ ds = dataset & dataset
+ assert isinstance(ds, IntersectionDataset)
+
+ # `|` must produce a UnionDataset.
+ def test_or(self, dataset: INaturalist) -> None:
+ ds = dataset | dataset
+ assert isinstance(ds, UnionDataset)
+
+ # A freshly created temporary directory contains no CSV file, so construction
+ # must fail with FileNotFoundError.
+ def test_no_data(self, tmp_path: Path) -> None:
+ with pytest.raises(FileNotFoundError, match="Dataset not found"):
+ INaturalist(str(tmp_path))
+
+ # Simulates pandas being uninstalled: builtins.__import__ is replaced by a
+ # wrapper that raises ImportError for the name "pandas" and defers to the real
+ # import for everything else. The patch is applied through monkeypatch.
+ @pytest.fixture
+ def mock_missing_module(self, monkeypatch: MonkeyPatch) -> None:
+ import_orig = builtins.__import__
+
+ def mocked_import(name: str, *args: Any, **kwargs: Any) -> Any:
+ if name == "pandas":
+ raise ImportError()
+ return import_orig(name, *args, **kwargs)
+
+ monkeypatch.setattr(builtins, "__import__", mocked_import)
+
+ # Reuses the existing dataset's root, which does contain a CSV, so the
+ # constructor gets past the file check and reaches the pandas import.
+ def test_mock_missing_module(
+ self, dataset: INaturalist, mock_missing_module: None
+ ) -> None:
+ with pytest.raises(
+ ImportError,
+ match="pandas is not installed and is required to use this dataset",
+ ):
+ INaturalist(dataset.root)
+
+ # A zero-sized box at the origin does not overlap any fixture observation
+ # (they all sit at latitude 41.881832, longitude -87.623177), so indexing must
+ # raise IndexError.
+ def test_invalid_query(self, dataset: INaturalist) -> None:
+ query = BoundingBox(0, 0, 0, 0, 0, 0)
+ with pytest.raises(
+ IndexError, match="query: .* not found in index with bounds:"
+ ):
+ dataset[query]
diff --git a/torchgeo/datasets/__init__.py b/torchgeo/datasets/__init__.py
index d3a5f9fe3f0..1e00587b898 100644
--- a/torchgeo/datasets/__init__.py
+++ b/torchgeo/datasets/__init__.py
@@ -48,6 +48,7 @@
from .gid15 import GID15
from .globbiomass import GlobBiomass
from .idtrees import IDTReeS
+from .inaturalist import INaturalist
from .inria import InriaAerialImageLabeling
from .landcoverai import LandCoverAI
from .landsat import (
@@ -121,6 +122,7 @@
"EUDEM",
"GBIF",
"GlobBiomass",
+ "INaturalist",
"Landsat",
"Landsat1",
"Landsat2",
diff --git a/torchgeo/datasets/inaturalist.py b/torchgeo/datasets/inaturalist.py
new file mode 100644
index 00000000000..1083be878b3
--- /dev/null
+++ b/torchgeo/datasets/inaturalist.py
@@ -0,0 +1,123 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+"""Dataset for iNaturalist."""
+
+import glob
+import os
+import sys
+from typing import Any, Dict
+
+from rasterio.crs import CRS
+
+from .geo import GeoDataset
+from .utils import BoundingBox, disambiguate_timestamp
+
+
+class INaturalist(GeoDataset):
+    """Dataset for iNaturalist.
+
+    `iNaturalist <https://www.inaturalist.org/>`_ is a joint initiative of the
+    California Academy of Sciences and the National Geographic Society. It allows
+    citizen scientists to upload observations of organisms that can be downloaded by
+    scientists and researchers.
+
+    If you use an iNaturalist dataset in your research, please cite it according to:
+
+    * https://www.inaturalist.org/pages/help#cite
+
+    .. note::
+       This dataset requires the following additional library to be installed:
+
+       * `pandas <https://pandas.pydata.org/>`_ to load CSV files
+
+    .. versionadded:: 0.3
+    """
+
+    # Every observation is indexed as a single point (see ``__init__``)
+    res = 0
+    _crs = CRS.from_epsg(4326)  # Lat/Lon
+
+    def __init__(self, root: str = "data") -> None:
+        """Initialize a new Dataset instance.
+
+        Only the first CSV file found directly inside ``root`` is read. Rows
+        without a latitude or longitude are skipped. Rows without any observation
+        date or time are indexed with the time range ``[0, sys.maxsize]``.
+
+        Args:
+            root: root directory where dataset can be found
+
+        Raises:
+            FileNotFoundError: if no files are found in ``root``
+            ImportError: if pandas is not installed
+        """
+        super().__init__()
+
+        self.root = root
+
+        # Without ``recursive=True`` the "**" wildcard matches like "*", so only
+        # CSV files directly inside ``root`` are found
+        files = glob.glob(os.path.join(root, "**.csv"))
+        if not files:
+            raise FileNotFoundError(f"Dataset not found in `root={self.root}`")
+
+        try:
+            import pandas as pd  # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "pandas is not installed and is required to use this dataset"
+            )
+
+        # Read CSV file
+        columns = ["observed_on", "time_observed_at", "latitude", "longitude"]
+        data = pd.read_csv(files[0], engine="c", usecols=columns)
+
+        # ``usecols`` ignores the order of its elements: the DataFrame keeps the
+        # column order of the file. Reindex by name so that the positional unpacking
+        # in the loop below does not depend on how the CSV happens to be laid out.
+        data = data[columns]
+
+        # Dataset contains many possible timestamps:
+        #
+        # * observed_on_string: no consistent format (can't use)
+        # * observed_on: day precision (better)
+        # * time_observed_at: second precision (best)
+        # * created_at: when observation was submitted (shouldn't use)
+        # * updated_at: when submission was updated (shouldn't use)
+        #
+        # The created_at/updated_at timestamps can be years after the actual submission,
+        # so they shouldn't be used, even if observed_on/time_observed_at are missing.
+
+        # Convert from pandas DataFrame to rtree Index
+        # A manual counter is used (rather than enumerate) so that skipped rows do
+        # not leave gaps in the index ids
+        i = 0
+        for date, time, y, x in data.itertuples(index=False, name=None):
+            # Skip rows without lat/lon
+            if pd.isna(y) or pd.isna(x):
+                continue
+
+            # Prefer the most precise timestamp available
+            if not pd.isna(time):
+                mint, maxt = disambiguate_timestamp(time, "%Y-%m-%d %H:%M:%S %z")
+            elif not pd.isna(date):
+                mint, maxt = disambiguate_timestamp(date, "%Y-%m-%d")
+            else:
+                # No usable timestamp: fall back to the widest possible time range
+                mint, maxt = 0, sys.maxsize
+
+            # Point observation: min and max coincide in both x and y
+            coords = (x, x, y, y, mint, maxt)
+            self.index.insert(i, coords)
+            i += 1
+
+    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
+        """Retrieve metadata indexed by query.
+
+        Args:
+            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index
+
+        Returns:
+            sample containing the dataset ``crs`` and, under ``bbox``, one
+            (minx, maxx, miny, maxy, mint, maxt) bounding box per observation
+            that intersects ``query``
+
+        Raises:
+            IndexError: if query is not found in the index
+        """
+        hits = self.index.intersection(tuple(query), objects=True)
+
+        # ``hit.bounds`` is in (minx, maxx, miny, maxy, mint, maxt) order, the same
+        # order used for insertion and for queries. ``hit.bbox`` would instead give
+        # rtree's interleaved (minx, miny, mint, maxx, maxy, maxt) order.
+        # NOTE(review): assumes the index created by GeoDataset is non-interleaved,
+        # as the insertion order in ``__init__`` implies -- confirm in geo.py.
+        bboxes = [BoundingBox(*hit.bounds) for hit in hits]
+
+        if not bboxes:
+            raise IndexError(
+                f"query: {query} not found in index with bounds: {self.bounds}"
+            )
+
+        sample = {"crs": self.crs, "bbox": bboxes}
+
+        return sample