-
Notifications
You must be signed in to change notification settings - Fork 5
Additional Examples
Additional questions, answers, and how-tos regarding the search API
There are at least two possible methods.
- Using the iDigBio search API directly:
https://search.idigbio.org/v2/search/records/?rq={"data.dwc:dynamicProperties":"nsf_tcn"}&limit=1
The same URL, with proper URL-encoding of special characters:
https://search.idigbio.org/v2/search/records/?rq=%7B%22data.dwc%3AdynamicProperties%22%3A%22nsf_tcn%22%7D&limit=1
At the time this example was created, we found 104340 records that contain 'nsf_tcn' in dynamicProperties.
The itemCount field returns the number of records that matched the query, regardless of whether the results are constrained by the limit
parameter.
Results of above query look like the following (truncated for brevity) when fetched using HTTPie:
$ http 'https://search.idigbio.org/v2/search/records/?rq={"data.dwc:dynamicProperties":"nsf_tcn"}&limit=1'
{
"itemCount": 104340,
"items": [
{
"data": {
"dcterms:language": "en",
"dcterms:license": "https://creativecommons.org/publicdomain/zero/1.0/",
"dcterms:modified": "2018-03-14 12:27:59",
"dcterms:references": "http://www.burkemuseum.org/collections/search/result.php?GUID=UWBM:InvertebratePaleontology:66079",
"dcterms:rightsHolder": "University of Washington Burke Museum",
"dcterms:type": "PhysicalObject",
"dwc:Identification": [
{
"coreid": "urn:catalog:UWBM:InvertebratePaleontology:66079",
"dwc:class": "Gastropoda",
"dwc:family": "Buccinidae",
"dwc:genus": "Parvisipho",
"dwc:higherClassification": "Mollusca; Gastropoda; Neogastropoda; Buccinidae",
"dwc:nomenclaturalCode": "ICZN",
"dwc:order": "Neogastropoda",
"dwc:phylum": "Mollusca",
"dwc:scientificName": "Parvisipho lewisiana (Weaver), 1912",
"dwc:scientificNameAuthorship": "(Weaver), 1912",
"dwc:specificEpithet": "lewisiana",
"dwc:taxonRank": "species"
}
],
"dwc:basisOfRecord": "FossilSpecimen",
"dwc:catalogNumber": "66079",
"dwc:class": "Gastropoda",
"dwc:collectionCode": "Invertebrate Paleontology",
"dwc:continent": "North America",
"dwc:coordinateUncertaintyInMeters": "5566",
"dwc:country": "United States",
"dwc:county": "Lewis",
"dwc:dataGeneralizations": "Latitude and longitude reduced in precision by 0.1 degrees.",
"dwc:decimalLatitude": "46.45",
"dwc:decimalLongitude": "-122.95",
"dwc:dynamicProperties": "{\"NSF_TCN\":\"EPICC\",\"Elements\":\"conch;cone\",\"CollectingParty\":\"David Nunnallee\"}",
"dwc:earliestEpochOrLowestSeries": "Eocene",
"dwc:earliestEraOrLowestErathem": "Cenozoic",
"dwc:earliestPeriodOrLowestSystem": "Paleogene",
"dwc:eventDate": "1967-00-00",
"dwc:family": "Buccinidae",
"dwc:fieldNumber": "25, 28",
"dwc:formation": "Cowlitz",
"dwc:genus": "Parvisipho",
"dwc:geodeticDatum": "WGS 84",
"dwc:georeferenceSources": "GEOLocate",
"dwc:georeferencedBy": "Hindle, Sawyer",
"dwc:georeferencedDate": "2017-04-17 14:39:21",
"dwc:higherClassification": "Mollusca; Gastropoda; Neogastropoda; Buccinidae",
"dwc:higherGeography": "North America; United States; Washington; Lewis County",
"dwc:highestBiostratigraphicZone": "Narizian",
"dwc:individualCount": "3",
"dwc:informationWithheld": "Precise coordinates and locality withheld to protect sensitive information.",
"dwc:institutionCode": "UWBM",
"dwc:institutionID": "http://biocol.org/urn:lsid:biocol.org:col:34878",
"dwc:latestEpochOrHighestSeries": "Eocene",
"dwc:latestEraOrHighestErathem": "Cenozoic",
"dwc:latestPeriodOrHighestSystem": "Paleogene",
"dwc:locality": "[redacted]",
"dwc:locationID": "A9522",
"dwc:lowestBiostratigraphicZone": "Narizian",
"dwc:nomenclaturalCode": "ICZN",
"dwc:occurrenceID": "urn:catalog:UWBM:InvertebratePaleontology:66079",
"dwc:occurrenceRemarks": "2 complete one with bore hole, one apex missing, one siphon missing Nunnallee Locality 28 = A9522-2.",
"dwc:order": "Neogastropoda",
"dwc:otherCatalogNumbers": "Accession: 1988-11, Other numbers: W659",
"dwc:ownerInstitutionCode": "UWBM",
"dwc:phylum": "Mollusca",
"dwc:recordNumber": "586, 724",
"dwc:recordedBy": "David Nunnallee",
"dwc:scientificName": "Parvisipho lewisiana (Weaver), 1912",
"dwc:scientificNameAuthorship": "(Weaver), 1912",
"dwc:specificEpithet": "lewisiana",
"dwc:stateProvince": "Washington",
"dwc:taxonRank": "species",
"dwc:verbatimEventDate": "1967-1970",
"dwc:year": "1967",
"id": "urn:catalog:UWBM:InvertebratePaleontology:66079"
},
"etag": "b8cd7ff265fdc9cc8cc36efbc1874dd8fa45934c",
"type": "records",
"uuid": "061594f4-69a3-41ff-9396-dac55cc8409b"
}
],
"lastModified": "2018-06-04T09:35:41.809Z"
}
If you want to retrieve large result sets that exceed 100,000 records you can use your choice of scripting language using the following method. This code block is written in Python 3.
import requests
import time

# Page through a full result set larger than the API's 100,000-record
# offset ceiling: sort by uuid and advance the lower range bound ("gt")
# past the last uuid seen on each page.
query = {
    "rq": {
        "genus": "euphorbia",
        "uuid": {
            "type": "range",
            "gt": "00000000-0000-0000-0000-000000000000",
            "lte": "ffffffff-ffff-ffff-ffff-ffffffffffff"
        }
    },
    "sort": [
        {"uuid": "asc"}
    ],
    "limit": 100,
    "offset": 0
}

inc = 1
while True:
    # https avoids an extra redirect; the search API is served over TLS.
    response = requests.post("https://search.idigbio.org/v2/search/records/", json=query)
    if response.status_code != 200:
        # Transient failure: back off briefly and retry the same page.
        time.sleep(2)
        continue

    # Save the raw page before parsing so nothing is lost on a parse error.
    # Do whatever you need with the records here: records['items'][0..n]
    with open('data-' + str(inc).zfill(5) + '.json', 'w') as outfile:
        outfile.write(response.text)
    try:
        records = response.json(strict=False)
    except ValueError:
        # Malformed payload: re-request the same page (inc is unchanged,
        # so the bad file is overwritten on the retry).
        continue
    inc = inc + 1

    # itemCount counts the records still at or beyond the current "gt"
    # bound, so once it fits within one page this was the last page.
    if records['itemCount'] <= query['limit']:
        break
    # Start the next page just past the last uuid of this one.
    query['rq']['uuid']['gt'] = records['items'][-1]['uuid']
If you want to download the media for a query this Python 3 script is a good starting place:
import requests
import time
import uuid

# Page through all records matching the query (sorted by uuid, range-bound
# paging as above) and download every associated media file.
query = {
    "rq": {
        "genus": "euphorbia",
        "hasImage": True,
        "uuid": {
            "type": "range",
            "gt": "00000000-0000-0000-0000-000000000000",
            "lte": "ffffffff-ffff-ffff-ffff-ffffffffffff"
        }
    },
    "sort": [
        {"uuid": "asc"}
    ],
    "limit": 100,
    "offset": 0
}

inc = 1
while True:
    try:
        response = requests.post("https://search.idigbio.org/v2/search/records/", json=query)
    except requests.exceptions.RequestException as e:
        print("Not able to send API request! ")
        raise SystemExit(e)
    if response.status_code != 200:
        # FIX: status_code is an int — concatenating it to a str raised
        # TypeError in the original; convert it first.
        print("response code not 200: " + str(response.status_code))
        break

    # Save the raw page before parsing so nothing is lost on a parse error.
    with open('test-' + str(inc).zfill(5) + '.json', 'w') as outfile:
        outfile.write(response.text)
    try:
        records = response.json(strict=False)
    except ValueError:
        # Malformed payload: re-request the same page.
        continue

    for rec in records['items']:
        if 'mediarecords' not in rec['indexTerms']:
            continue
        for uuidstr in rec['indexTerms']['mediarecords']:
            # Validate/normalize the uuid string before building URLs.
            au = uuid.UUID(uuidstr)
            r = requests.get("https://search.idigbio.org/v2/view/mediarecords/" + str(au))
            try:
                mm = r.json(strict=False)
            except ValueError:
                print("Failed to retrieve mediarecord: " + str(au))
                continue

            # Simplified and possibly incomplete determination of the
            # correct associated URL for a mediarecord UUID.
            # FIX: reset url each iteration — previously, a mediarecord
            # with no usable URL either raised NameError (first pass) or
            # silently re-downloaded the previous record's URL.
            url = None
            if 'ac:accessURI' in mm['data']:
                url = mm['data']['ac:accessURI']
            elif 'dcterms:identifier' in mm['data'] and 'http' in mm['data']['dcterms:identifier']:
                url = mm['data']['dcterms:identifier']
            if url is None:
                print("No access URL found for mediarecord: " + str(au))
                continue
            print(url)

            try:
                print("getting the URL: " + url)
                imgresp = requests.get(url)
            except requests.exceptions.RequestException:
                print("Couldn't retrieve URL: " + url)
                continue
            with open('img-' + str(au), 'wb') as imgfile:
                imgfile.write(imgresp.content)

    inc = inc + 1
    # Last page when the remaining match count fits within one page.
    if records['itemCount'] <= query['limit']:
        break
    # Advance the range bound past the last uuid on this page.
    query['rq']['uuid']['gt'] = records['items'][-1]['uuid']
If you wanted to download the records bundled in an archive you could use that same query with the Download API.
https://www.idigbio.org/wiki/index.php/IDigBio_Download_API
- Using a client library, such as the iDigBio R client, a similar query can be created by using the examples at the following link (replace "parasite" with "nsf_tcn").
https://rdrr.io/cran/ridigbio/man/idig_search_records.html
TBD