diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5ee5afe..3c694ec 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,9 +1,10 @@
 exclude: |
     (?x)^(
-        notebooks/GTS_Totals_weather_act.ipynb|
-        notebooks/IOOS_BTN.ipynb|
         btn_metrics.py|
         gts_atn_metrics.py|
+        notebooks/GTS_Totals_weather_act.ipynb|
+        notebooks/IOOS_BTN.ipynb|
+        notebooks/mbon_citation_visualizations.ipynb|
         read_bufr.py|
         website/.*
     )$
@@ -19,7 +20,7 @@ repos:
       - id: check-added-large-files
 
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
       - id: codespell
         exclude: >
@@ -35,7 +36,7 @@ repos:
      - id: add-trailing-comma
 
  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.3
+    rev: v0.4.6
    hooks:
      - id: ruff
        args: ["--fix", "--show-fixes"]
diff --git a/ioos_metrics/ioos_metrics.py b/ioos_metrics/ioos_metrics.py
index 3814d8f..b84b23d 100644
--- a/ioos_metrics/ioos_metrics.py
+++ b/ioos_metrics/ioos_metrics.py
@@ -11,7 +11,7 @@
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from gliderpy.fetchers import GliderDataFetcher
-from shapely.geometry import LineString
+from shapely.geometry import LineString, Point
 
 from ioos_metrics.national_platforms import national_platforms
 
@@ -206,6 +206,18 @@ def _metadata(info_df) -> dict:
             ),
         }
 
+    def _make_track_geom(df) -> "pd.DataFrame":
+        geom = Point if df.shape[0] == 1 else LineString
+
+        return geom(
+            (lon, lat)
+            for (lon, lat) in zip(
+                df["longitude (degrees_east)"],
+                df["latitude (degrees_north)"],
+                strict=False,
+            )
+        )
+
     def _computed_metadata(dataset_id) -> dict:
         """Download the minimum amount of data possible for the computed metadata.
 
@@ -220,16 +232,20 @@ def _computed_metadata(dataset_id) -> dict:
             "longitude",
             "time",
         ]
-        df = glider_grab.to_pandas()
+        df = glider_grab.fetcher.to_pandas(distinct=True)
+        df["time (UTC)"] = pd.to_datetime(df["time (UTC)"])
+        df = df.set_index("time (UTC)")
         df = df.sort_index()
+        track = _make_track_geom(df)
         days = df.index[-1].ceil("D") - df.index[0].floor("D")
         return {
-            "deployment_lat": df["latitude"].iloc[0],
-            "deployment_lon": df["longitude"].iloc[0],
+            "deployment_lat": df["latitude (degrees_north)"].iloc[0],
+            "deployment_lon": df["longitude (degrees_east)"].iloc[0],
             "num_profiles": len(df),
             # Profiles are not unique! Cannot use this!!
             # "num_profiles": len(set(df['profile_id']))
             "days": days,
+            "track": track,
         }
 
     glider_grab = GliderDataFetcher()
@@ -245,21 +261,8 @@ def _computed_metadata(dataset_id) -> dict:
     )
 
     metadata = {}
-    glider_grab.fetcher.variables = ["longitude", "latitude"]
     for _, row in list(df.iterrows()):
         dataset_id = row["Dataset ID"]
-
-        glider_grab.fetcher.dataset_id = dataset_id
-        track = glider_grab.fetcher.to_pandas(distinct=True)
-        track = LineString(
-            (lon, lat)
-            for (lon, lat) in zip(
-                track["longitude (degrees_east)"],
-                track["latitude (degrees_north)"],
-                strict=False,
-            )
-        )
-
         info_url = row["info_url"].replace("html", "csv")
         info_df = pd.read_csv(info_url)
         info = _metadata(info_df)
@@ -271,7 +274,6 @@ def _computed_metadata(dataset_id) -> dict:
                 "This could be a server side error and the metrics will be incomplete!",
             )
             continue
-        info.update({"track": track})
         metadata.update({dataset_id: info})
 
     return pd.DataFrame(metadata).T
@@ -554,68 +556,71 @@ def hf_radar_installations():
     # This is a hardcoded number at the moment!
     return 165
 
+
 @functools.lru_cache(maxsize=128)
 def mbon_stats():
-    """
-    This function collects download statistics about MBON affiliated datasets shared with the Ocean Biodiversity
+    """Collects download statistics about MBON affiliated datasets shared with the Ocean Biodiversity
     Information System (OBIS) and the Global Biodiversity Information Framework (GBIF). The function returns a
     dataframe with rows corresponding to each paper citing a dataset.
     """
-    import pyobis
     import urllib.parse
 
+    import pyobis
+
     # collect dataset information from OBIS
     institution_id = 23070
     query = pyobis.dataset.search(instituteid=institution_id)
     df = pd.DataFrame(query.execute())
     df_obis = pd.DataFrame.from_records(df["results"])
-    df_obis.columns = [f'obis_{col}' for col in df_obis.columns]
+    df_obis.columns = [f"obis_{col}" for col in df_obis.columns]
 
     df_mapping = pd.DataFrame()
-    base_url = 'https://api.gbif.org'
+    base_url = "https://api.gbif.org"
     # iterate through each OBIS dataset to gather uuid from GBIF
     # create a mapping table
-    for title in df_obis['obis_title']:
+    for title in df_obis["obis_title"]:
         string = title
-        query = f'{base_url}/v1/dataset/search?q={urllib.parse.quote(string)}'
-        df = pd.read_json(query, orient='index').T
+        query = f"{base_url}/v1/dataset/search?q={urllib.parse.quote(string)}"
+        df = pd.read_json(query, orient="index").T
 
         # build a DataFrame with the info we need more accessible
-        df_mapping = pd.concat([df_mapping, pd.DataFrame({
-            'gbif_uuid': df['results'].values[0][0]['key'],
-            'title': [df['results'].values[0][0]['title']],
-            'obis_id': [df_obis.loc[df_obis['obis_title']==title,'obis_id'].to_string(index=False)],
-            'doi': [df['results'].values[0][0]['doi']]
-        })], ignore_index=True)
-
+        df_mapping = pd.concat(
+            [
+                df_mapping,
+                pd.DataFrame(
+                    {
+                        "gbif_uuid": df["results"].to_numpy()[0][0]["key"],
+                        "title": [df["results"].to_numpy()[0][0]["title"]],
+                        "obis_id": [df_obis.loc[df_obis["obis_title"] == title, "obis_id"].to_string(index=False)],
+                        "doi": [df["results"].to_numpy()[0][0]["doi"]],
+                    },
+                ),
+            ],
+            ignore_index=True,
+        )
     df_gbif = pd.DataFrame()
-    for key in df_mapping['gbif_uuid']:
-
-        url = 'https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={}'.format(key)
-        df2 = pd.read_csv(url) # collect liturature cited information
-        df2.columns = ['literature_' + str(col) for col in df2.columns]
-        df2['gbif_uuid'] = key
+    for key in df_mapping["gbif_uuid"]:
+        url = f"https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={key}"
+        df2 = pd.read_csv(url)  # collect literature cited information
+        df2.columns = ["literature_" + str(col) for col in df2.columns]
+        df2["gbif_uuid"] = key
 
-        df_gbif = pd.concat([df2,df_gbif], ignore_index=True)
+        df_gbif = pd.concat([df2, df_gbif], ignore_index=True)
 
     # merge the OBIS and GBIF data frames together
-    df_obis = df_obis.merge(df_mapping, on='obis_id')
+    df_obis = df_obis.merge(df_mapping, on="obis_id")
 
     # add gbif download stats
-    for key in df_obis['gbif_uuid']:
-        url = f'https://api.gbif.org/v1/occurrence/download/statistics/export?datasetKey={key}'
-        df2 = pd.read_csv(url,sep='\t')
-        df2_group = df2.groupby('year').agg({'number_downloads':'sum'})
-
-        df_obis.loc[df_obis['gbif_uuid']==key,'gbif_downloads'] = str(df2_group.to_dict())
-
-    df_out = df_gbif.merge(df_obis, on='gbif_uuid')
-
-    return df_out
+    for key in df_obis["gbif_uuid"]:
+        url = f"https://api.gbif.org/v1/occurrence/download/statistics/export?datasetKey={key}"
+        df2 = pd.read_csv(url, sep="\t")
+        df2_group = df2.groupby("year").agg({"number_downloads": "sum"})
+        df_obis.loc[df_obis["gbif_uuid"] == key, "gbif_downloads"] = str(df2_group.to_dict())
+    return df_gbif.merge(df_obis, on="gbif_uuid")
 
 
 def update_metrics(*, debug=False):
diff --git a/pyproject.toml b/pyproject.toml
index 16718ce..773928b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
   "pandas",
   "pdfminer.six",
   "pyarrow",
+  "pyobis",
   "requests",
 ]
 [project.urls]
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index 093c19b..6cea585 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -74,24 +74,57 @@ def test_update_metrics():
     df = update_metrics(debug=True)
     df.to_csv("updated_metrics.csv")
 
+
 def test_mbon_stats():
     df = ioos_metrics.mbon_stats()
-    columns = ['literature_title', 'literature_authors', 'literature_source',
-       'literature_discovered', 'literature_published',
-       'literature_open_access', 'literature_peer_review',
-       'literature_citation_type', 'literature_countries_of_coverage',
-       'literature_countries_of_researcher', 'literature_keywords',
-       'literature_literature_type', 'literature_websites',
-       'literature_identifiers', 'literature_id', 'literature_abstract',
-       'literature_topics', 'literature_added', 'literature_gbif_download_key',
-       'gbif_uuid', 'obis_id', 'obis_url', 'obis_archive', 'obis_published',
-       'obis_created', 'obis_updated', 'obis_core', 'obis_extensions',
-       'obis_statistics', 'obis_extent', 'obis_title', 'obis_citation',
-       'obis_citation_id', 'obis_abstract', 'obis_intellectualrights',
-       'obis_feed', 'obis_institutes', 'obis_contacts', 'obis_nodes',
-       'obis_keywords', 'obis_downloads', 'obis_records', 'title', 'doi',
-       'gbif_downloads']
+    columns = [
+        "literature_title",
+        "literature_authors",
+        "literature_source",
+        "literature_discovered",
+        "literature_published",
+        "literature_open_access",
+        "literature_peer_review",
+        "literature_citation_type",
+        "literature_countries_of_coverage",
+        "literature_countries_of_researcher",
+        "literature_keywords",
+        "literature_literature_type",
+        "literature_websites",
+        "literature_identifiers",
+        "literature_id",
+        "literature_abstract",
+        "literature_topics",
+        "literature_added",
+        "literature_gbif_download_key",
+        "gbif_uuid",
+        "obis_id",
+        "obis_url",
+        "obis_archive",
+        "obis_published",
+        "obis_created",
+        "obis_updated",
+        "obis_core",
+        "obis_extensions",
+        "obis_statistics",
+        "obis_extent",
+        "obis_title",
+        "obis_citation",
+        "obis_citation_id",
+        "obis_abstract",
+        "obis_intellectualrights",
+        "obis_feed",
+        "obis_institutes",
+        "obis_contacts",
+        "obis_nodes",
+        "obis_keywords",
+        "obis_downloads",
+        "obis_records",
+        "title",
+        "doi",
+        "gbif_downloads",
+    ]
 
     assert isinstance(df, pd.DataFrame)
-    assert all([col in df.columns for col in columns])
-    assert not df.empty
\ No newline at end of file
+    assert all(col in df.columns for col in columns)
+    assert not df.empty