Add dataset.services() method to list available services (#500)

Fixes #447 Co-authored-by: Matt Fisher <mfisher87@gmail.com> Co-authored-by: Jessica Scheick <JessicaS11@users.noreply.github.com> Co-authored-by: Luis López <luis.lopezespinosa@colorado.edu> Co-authored-by: Chuck Daniels <chuck@developmentseed.org>
nsidc · Sep 16, 2024 · 699cc4e · 699cc4e
1 parent c27b502
commit 699cc4e
Show file tree

Hide file tree

Showing 23 changed files with 4,724 additions and 220 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -33,6 +33,12 @@ jobs:
           poetry config virtualenvs.create true --local
           poetry config virtualenvs.in-project true --local
           poetry self add setuptools
+      - name: Set up mypy cache
+        uses: actions/cache@v4
+        id: mypy-cache
+        with:
+          path: .mypy_cache
+          key: mypy-${{ runner.os }}-${{ steps.full-python-version.outputs.version }}-${{ hashFiles('poetry.lock') }}
       - name: Set up cache
         uses: actions/cache@v4
         id: cache
@@ -44,12 +50,12 @@ jobs:
         run: poetry run pip --version >/dev/null 2>&1 || rm -rf .venv
       - name: Install Dependencies
         if: ${{ !env.ACT }}
-        run: poetry install
+        run: poetry install --quiet
       - name: Install Dependencies
         if: ${{ env.ACT }}
         # When using `act` to run the workflow locally, the `poetry install` command
         # may fail due to network issues when running multiple Docker containers.
-        run: poetry install || poetry install || poetry install
+        run: poetry install --quiet || poetry install --quiet || poetry install --quiet
       - name: Test
         run: poetry run bash scripts/test.sh
       - name: Upload coverage

diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/docs/howto/search-services.md b/docs/howto/search-services.md
@@ -0,0 +1,42 @@
+# How to search for services using `earthaccess`
+
+You can search for services associated with a dataset.  Services include
+back-end processing workflows that transform or process the data in some way
+(e.g., clipping to a spatial extent or converting to a different file format).
+
+`earthaccess` facilitates the retrieval of service metadata via the
+`search_datasets` function.  Each result returned by `search_datasets` is an
+enhanced Python dictionary that includes a `services` method, which returns
+the metadata for all services associated with that collection as a Python
+dictionary keyed by service concept ID.
+
+To search for services, import the earthaccess library and search by dataset
+(you need to know the short name of the dataset which can be found on the
+dataset landing page):
+
+```py
+import earthaccess
+
+datasets = earthaccess.search_datasets(
+    short_name="MUR-JPL-L4-GLOB-v4.1",
+    cloud_hosted=True,
+    temporal=("2024-02-27T00:00:00Z", "2024-02-29T23:59:59Z"),
+)
+```
+
+Call the `services` method on each result to retrieve metadata on the services available for that dataset.
+
+```py
+for dataset in datasets:
+    print(dataset.services())
+```
+
+Alternatively, you may search directly for services.  For example:
+
+```py
+services = earthaccess.search_services(provider="POCLOUD", keyword="COG")
+```
+
+The keyword arguments supported by the `search_services` function are
+constrained to what the NASA CMR allows, as described in the
+[Service section of the CMR API](https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html#service).
diff --git a/docs/user-reference/collections/collections-services.md b/docs/user-reference/collections/collections-services.md
@@ -0,0 +1,7 @@
+# Documentation for `Collection Services`
+
+::: earthaccess.DataServices
+    options:
+      inherited_members: true
+    show_root_heading: true
+    show_source: false
diff --git a/docs/user_guide/access.md b/docs/user_guide/access.md
@@ -7,7 +7,7 @@
     We are reorganizing and updating the documentation, so not all pages are complete.  If you are looking for information about accessing data using earthaccess see the
     HOW-TO pages below.
 
-    * [Quick start](../../quick-start/)
+    * [Quick start](../quick-start.md)
     * [How-to download data](../howto/onprem.md)
 
 ## Downloading data

diff --git a/docs/user_guide/authenticate.md b/docs/user_guide/authenticate.md
@@ -7,5 +7,5 @@ Introduces the `earthaccess.login` method for managing Earthdata Login and cloud
     We are reorganizing and updating the documentation, so not all pages are complete.  If you are looking for information about authenticating using earthaccess see the
     How-Tos and Tutorials in links below.
 
-    * [Quick start](../../quick-start/)
+    * [Quick start](../quick-start.md)
     * [How-To Authenticate with earthaccess](../howto/authenticate.md)
diff --git a/docs/user_guide/search.md b/docs/user_guide/search.md
@@ -5,7 +5,7 @@
     We are reorganizing and updating the documentation, so not all pages are complete.  If you are looking for information about authenticating using earthaccess see the
     How-Tos and Tutorials in links below.
 
-    * [Quick start](../../quick-start/)
+    * [Quick start](../quick-start.md)
     * [How-To Access Data](../howto/access-data.md)
 
 ## `search_datasets`

diff --git a/earthaccess/__init__.py b/earthaccess/__init__.py
@@ -17,10 +17,12 @@
     open,
     search_data,
     search_datasets,
+    search_services,
 )
 from .auth import Auth
 from .kerchunk import consolidate_metadata
 from .search import DataCollections, DataGranules
+from .services import DataServices
 from .store import Store
 from .system import PROD, UAT
 
@@ -31,6 +33,7 @@
     "login",
     "search_datasets",
     "search_data",
+    "search_services",
     "get_requests_https_session",
     "get_fsspec_https_session",
     "get_s3fs_session",
@@ -45,6 +48,7 @@
     # search.py
     "DataGranules",
     "DataCollections",
+    "DataServices",
     # auth.py
     "Auth",
     # store.py
@@ -70,26 +74,24 @@ def __getattr__(name):  # type: ignore
     """
     global _auth, _store
 
-    if name == "__auth__" or name == "__store__":
-        with _lock:
-            if not _auth.authenticated:
-                for strategy in ["environment", "netrc"]:
-                    try:
-                        _auth.login(strategy=strategy)
-                    except Exception as e:
+    if name not in ["__auth__", "__store__"]:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+    with _lock:
+        if not _auth.authenticated:
+            for strategy in ["environment", "netrc"]:
+                try:
+                    _auth.login(strategy=strategy)
+
+                    if _auth.authenticated:
+                        _store = Store(_auth)
                         logger.debug(
-                            f"An error occurred during automatic authentication with {strategy=}: {str(e)}"
+                            f"Automatic authentication with {strategy=} was successful"
                         )
-                        continue
-                    else:
-                        if not _auth.authenticated:
-                            continue
-                        else:
-                            _store = Store(_auth)
-                            logger.debug(
-                                f"Automatic authentication with {strategy=} was successful"
-                            )
-                            break
-            return _auth if name == "__auth__" else _store
-    else:
-        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+                        break
+                except Exception as e:
+                    logger.debug(
+                        f"An error occurred during automatic authentication with {strategy=}: {str(e)}"
+                    )
+
+        return _auth if name == "__auth__" else _store
diff --git a/earthaccess/api.py b/earthaccess/api.py
@@ -6,6 +6,7 @@
 from typing_extensions import Any, Dict, List, Optional, Union, deprecated
 
 import earthaccess
+from earthaccess.services import DataServices
 
 from .auth import Auth
 from .results import DataCollection, DataGranule
@@ -130,6 +131,34 @@ def search_data(count: int = -1, **kwargs: Any) -> List[DataGranule]:
     return query.get_all()
 
 
+def search_services(count: int = -1, **kwargs: Any) -> List[Any]:
+    """Search the NASA CMR for Services matching criteria.
+
+    See <https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html#service>.
+
+    The query is built with `DataServices` using the module-level
+    `earthaccess.__auth__`, and the hit count reported by the query is logged
+    at INFO level before results are fetched.
+
+    Parameters:
+        count:
+            maximum number of services to fetch (if less than 1, all services
+            matching specified criteria are fetched [default])
+        kwargs:
+            keyword arguments accepted by the CMR for searching services
+
+    Returns:
+        list of services (possibly empty) matching specified criteria, in UMM
+        JSON format
+
+    Examples:
+        ```python
+        services = search_services(provider="POCLOUD", keyword="COG")
+        ```
+    """
+    query = DataServices(auth=earthaccess.__auth__).parameters(**kwargs)
+    # Ask the query for its hit count first so the fetch limit can be capped.
+    hits = query.hits()
+    logger.info(f"Services found: {hits}")
+
+    # Fetch every hit when count < 1; otherwise fetch at most `count`, and
+    # never more than the number of hits.
+    return query.get(hits if count < 1 else min(count, hits))
+
+
 def login(strategy: str = "all", persist: bool = False, system: System = PROD) -> Auth:
     """Authenticate with Earthdata login (https://urs.earthdata.nasa.gov/).
 

diff --git a/earthaccess/results.py b/earthaccess/results.py
@@ -2,7 +2,10 @@
 import uuid
 from typing import Any, Dict, List, Optional, Union
 
+import earthaccess
+
 from .formatters import _repr_granule_html
+from .services import DataServices
 
 
 class CustomDict(dict):
@@ -178,6 +181,16 @@ def s3_bucket(self) -> Dict[str, Any]:
             return self["umm"]["DirectDistributionInformation"]
         return {}
 
+    def services(self) -> Dict[Any, List[Dict[str, Any]]]:
+        """Return the services associated with this collection.
+
+        Each entry found under `meta.associations.services` of this
+        collection is passed as the `concept_id` parameter of a
+        `DataServices` query built with the module-level
+        `earthaccess.__auth__`.
+
+        Returns:
+            A dictionary mapping each associated service entry to the result
+            of that query's `get_all()`; empty when the collection lists no
+            associated services.
+        """
+        # Every level defaults to an empty container, so missing metadata
+        # yields an empty list rather than a KeyError.
+        services = self.get("meta", {}).get("associations", {}).get("services", [])
+        # Generator expression: each query object is only constructed when the
+        # dict comprehension below consumes it.
+        queries = (
+            DataServices(auth=earthaccess.__auth__).parameters(concept_id=service)
+            for service in services
+        )
+
+        return {service: query.get_all() for service, query in zip(services, queries)}
+
     def __repr__(self) -> str:
         return json.dumps(
             self.render_dict, sort_keys=False, indent=2, separators=(",", ": ")

diff --git a/earthaccess/search.py b/earthaccess/search.py
@@ -21,61 +21,14 @@
 from .auth import Auth
 from .daac import find_provider, find_provider_by_shortname
 from .results import DataCollection, DataGranule
+from .utils._search import get_results
 
 logger = logging.getLogger(__name__)
 
 FloatLike: TypeAlias = Union[str, SupportsFloat]
 PointLike: TypeAlias = Tuple[FloatLike, FloatLike]
 
 
-def get_results(
-    session: requests.Session,
-    query: Union[CollectionQuery, GranuleQuery],
-    limit: int = 2000,
-) -> List[Any]:
-    """Get all results up to some limit, even if spanning multiple pages.
-
-    ???+ Tip
-        The default page size is 2000, if the supplied value is greater then the
-        Search-After header will be used to iterate across multiple requests until
-        either the limit has been reached or there are no more results.
-
-    Parameters:
-        limit: The number of results to return
-
-    Returns:
-        query results as a list
-
-    Raises:
-        RuntimeError: The CMR query failed.
-    """
-    page_size = min(limit, 2000)
-    url = query._build_url()
-
-    results: List[Any] = []
-    more_results = True
-    headers = dict(query.headers or {})
-
-    while more_results:
-        response = session.get(url, headers=headers, params={"page_size": page_size})
-
-        if cmr_search_after := response.headers.get("cmr-search-after"):
-            headers["cmr-search-after"] = cmr_search_after
-
-        try:
-            response.raise_for_status()
-        except requests.exceptions.HTTPError as ex:
-            raise RuntimeError(ex.response.text) from ex
-
-        latest = response.json()["items"]
-
-        results.extend(latest)
-
-        more_results = page_size <= len(latest) and len(results) < limit
-
-    return results
-
-
 class DataCollections(CollectionQuery):
     """Placeholder.
 

diff --git a/earthaccess/services.py b/earthaccess/services.py
@@ -0,0 +1,47 @@
+from typing import Any, List, Optional
+
+import requests
+
+from cmr import ServiceQuery
+
+from .auth import Auth
+from .utils import _search as search
+
+
+class DataServices(ServiceQuery):
+    """A Service client for NASA CMR that returns data on collection services.
+
+    API: https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html#service
+    """
+
+    # NOTE(review): presumably read by the `ServiceQuery` base class to select
+    # the UMM JSON response format -- confirm against python-cmr.
+    _format = "umm_json"
+
+    def __init__(self, auth: Optional[Auth] = None, *args: Any, **kwargs: Any) -> None:
+        """Build an instance of `DataServices` to query CMR.
+
+        auth is an optional parameter for queries that need authentication,
+        e.g. restricted datasets.
+
+        Parameters:
+            auth: An authenticated `Auth` instance.
+            args: Positional arguments forwarded to `ServiceQuery.__init__`.
+            kwargs: Keyword arguments forwarded to `ServiceQuery.__init__`.
+        """
+        super().__init__(*args, **kwargs)
+        self._debug = False
+
+        # To search, we need the new bearer tokens from NASA Earthdata.
+        # Without an authenticated `Auth`, fall back to a plain `requests`
+        # session.
+        self.session = (
+            auth.get_session(bearer_token=True)
+            if auth is not None and auth.authenticated
+            else requests.sessions.Session()
+        )
+
+    def get(self, limit: int = 2000) -> List:
+        """Get all service results up to some limit.
+
+        Delegates to `search.get_results`, passing this query and its session.
+
+        Parameters:
+            limit (int): The number of results to return
+
+        Returns:
+            Query results as a list
+        """
+        return search.get_results(self.session, self, limit)