Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat gbif parser #1

Open
wants to merge 8 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,5 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
#.idea/
api2gn/dev.md
30 changes: 27 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,33 @@
# Installation

#Depuis le venv de GeoNature
source <GeoNature_DIR>/backend/venv/bin/activate
```
# Récupération de l'archive
wget https://github.com/PnX-SI/api2GN/archive/1.0.0.rc1.zip
unzip 1.0.0.rc1.zip
rm 1.0.0.rc1.zip
mv api2GN-1.0.0.rc1/ api2GN

# Depuis le venv de GeoNature
cd geonature
source backend/venv/bin/activate

# depuis le répertoire API2GN
pip install .
pip install .
pip install -e .
-> en mode développement

# Ajout du module dans l'interface admin geonature
geonature install-gn-module ~/api2GN API2GN

# Update de la DB
geonature db upgrade api2gn@head

```

Copier le fichier d'exemple `var/config/parsers.example.py` en `var/config/parsers.py`
```
cp var/config/parsers.example.py var/config/parsers.py
```

## Commandes

Expand All @@ -18,6 +40,8 @@ Copier le fichier d'exemple `var/config/parsers.example.py` en `var/config/parse
- Lancer un parser
```
geonature parser run <PARSER_NAME>
geonature parser run <PARSER_NAME> --dry-run
-> pour tester sans écrire en base de données
```

### Créer ses propres parser
Expand Down
1 change: 1 addition & 0 deletions api2gn/.env.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Store here the environment variables used by your custom parsers
216 changes: 216 additions & 0 deletions api2gn/gbif_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
from pygbif import occurrences, registry, species
from geojson import Feature
from shapely import wkt
from sqlalchemy.sql import func
from geoalchemy2.shape import from_shape
from api2gn.parsers import JSONParser
import json
import requests


def fetch_taxref_cd_nom(self):
    """Resolve the TaxRef reference id (``cd_nom``) of a GBIF occurrence.

    The occurrence is read from GBIF to obtain its ``taxonKey``, which is
    looked up in TaxRef as an external GBIF identifier. If anything in that
    lookup fails, the occurrence's ``scientificName`` is sent to the TaxRef
    fuzzy-match endpoint and the first returned taxon is used instead.

    Args:
        self: Occurrence record; must support ``self['key']``. Despite its
            name this is not an instance: the function is registered as the
            ``cd_nom`` entry of ``GBIFParser.dynamic_fields`` (presumably
            called by the parser with the current row -- confirm).

    Returns:
        str: The TaxRef ``referenceId``, or ``'183716'`` (Animalia) when the
        external-id lookup answers with an empty reference id.

    Raises:
        Exception: Whatever the fuzzy-match fallback raises (HTTP error,
            missing key, empty taxa list) when it fails as well.
    """
    taxref_api = "https://taxref.mnhn.fr/api/taxa"
    timeout = 30  # seconds; keeps a stalled TaxRef call from hanging the run

    occurrence = None
    try:
        occurrence = occurrences.get(self['key'])
        reply = requests.get(
            f"{taxref_api}/findByExternalId",
            params={
                "externalDbId": "gbif",
                "externalId": occurrence['taxonKey'],
            },
            timeout=timeout,
        )
        reply.raise_for_status()
        reference_id = reply.json()['referenceId']
        # A missing/empty id falls back to Animalia instead of leaking the
        # string "None" (or a padded value) as a cd_nom.
        if reference_id is None or str(reference_id) == "":
            return '183716'  # animalia
        return str(reference_id)
    except Exception as e:
        # Deliberately broad: any failure of the exact lookup triggers the
        # fuzzy-match fallback below.
        print(f"An error occurred: {e}")

    # Reuse the occurrence fetched above when that call succeeded.
    if occurrence is None:
        occurrence = occurrences.get(self['key'])
    print(occurrence)

    # ``params`` URL-encodes the name (spaces, "&", parentheses, ...).
    reply = requests.get(
        f"{taxref_api}/fuzzyMatch",
        params={"term": str(occurrence['scientificName'])},
        timeout=timeout,
    )
    reply.raise_for_status()

    # The first fuzzy match is taken as the reference taxon.
    return str(reply.json()['_embedded']['taxa'][0]['referenceId'])


class GBIFParser(JSONParser):
    """Parser importing GBIF occurrences through the ``pygbif`` client.

    The occurrences to import are either listed in ``occurrence_ids`` or,
    when that list is empty, searched with the ``datasetKey``, ``wkt`` and
    ``limit`` entries of ``api_filters``. Every occurrence is then fetched
    individually, together with its publishing organization, dataset and
    taxon records.
    """

    srid = 4326
    # One request is issued per occurrence, so a progress bar is not useful.
    progress_bar = False
    # Default occurrence keys; leave empty to search with ``api_filters``.
    occurrence_ids = []

    # TODO: mapping still to be improved.
    # NOTE(review): "count_min"/"count_max" are mapped to the literal 1
    # rather than to a field name -- confirm how the base parser resolves
    # such values.
    mapping = {
        # "unique_id_sinp" : "identifier",
        "date_min": "eventDate",
        "date_max": "eventDate",
        "nom_cite": "scientificName",
        "count_min": 1,
        "count_max": 1,
        "observers": "recordedBy",
        "determiner": "recordedBy",
        "meta_create_date": "eventDate",
        "meta_update_date": "eventDate",
        "place_name": "verbatimLocality",
        "entity_source_pk_value": "catalogNumber",
    }
    dynamic_fields = {
        # "cd_nom" is computed by a callable instead of being read directly.
        "cd_nom": fetch_taxref_cd_nom
    }

    def __init__(self):
        """Merge the subclass configuration and resolve the occurrence ids."""
        print(self.occurrence_ids)
        # Subclass values override the defaults visible on GBIFParser.
        self.api_filters = {**GBIFParser.api_filters, **self.api_filters}
        self.mapping = {**GBIFParser.mapping, **self.mapping}
        self.constant_fields = {
            **GBIFParser.constant_fields,
            **self.constant_fields,
        }
        # Initialize the parent class
        super().__init__()

        # State of the occurrence currently being processed.
        self.occurrence_id = None
        self.data = None
        self.organization_data = None
        self.dataset_data = None
        self.species_data = None
        self.subdivisions_data = None
        self.cd_nom = None

        self.validate_maping()

        # Per-instance copy: the class-level default list is never shared.
        self.occurrence_ids = list(self.occurrence_ids)
        # Explicit ids take priority; search only when none were given.
        if not self.occurrence_ids:
            self.occurrence_ids = self.fetch_occurrence_ids_search()
            print("Liste des IDs d'occurrence :", self.occurrence_ids)

    def fetch_occurrence_ids_search(self):
        """Search GBIF with ``api_filters`` and return the occurrence keys.

        Returns:
            The ``key`` of every returned record, or the raw ``data`` entry
            when the response carries one instead of ``results``.

        Raises:
            KeyError: if ``datasetKey``, ``wkt`` or ``limit`` is missing
                from ``api_filters``.
            ValueError: if the search response is empty.
        """

        def keys_of(records):
            # Only dict records can carry a "key"; anything else is skipped
            # instead of being substring-tested.
            return [
                record['key']
                for record in records
                if isinstance(record, dict) and 'key' in record
            ]

        print(self.api_filters )
        response = occurrences.search(
            datasetKey=self.api_filters['datasetKey'],
            geometry=self.api_filters['wkt'],
            limit=self.api_filters['limit'],
        )
        print('data fetch_occurrence_ids_search')
        print(response)

        if not response:
            raise ValueError("Failed to fetch data for search ")
        if isinstance(response, dict):
            if response.get('results'):
                occurrence_ids = keys_of(response['results'])
                print(occurrence_ids)
                return occurrence_ids
            if 'data' in response:
                return response['data']
            # Non-empty response without any record: nothing to import.
            return []
        # Fallback for a response that is directly a sequence of records.
        return keys_of(response)

    def fetch_occurrence_data(self, occurrence_id):
        """Fetch one occurrence and return its record.

        Raises:
            ValueError: if GBIF returns an empty response.
        """
        response = occurrences.get(occurrence_id)
        print(response)  # full response, printed to inspect its structure
        if not response:
            raise ValueError(f"Failed to fetch data for occurrence ID {occurrence_id}")
        if 'results' in response and len(response['results']) > 0:
            return response['results'][0]
        if 'data' in response:
            return response['data']
        return response

    def fetch_organization_data(self):
        """Return the publishing organization of the current occurrence.

        Returns ``{}`` when the occurrence has no ``publishingOrgKey``.
        """
        organization_key = self.data.get('publishingOrgKey')
        print(organization_key)
        if organization_key:
            return registry.organizations(uuid=organization_key)
        return {}

    def fetch_dataset_data(self):
        """Return the dataset of the current occurrence.

        Returns ``{}`` when the occurrence has no ``datasetKey``.
        """
        dataset_key = self.data.get('datasetKey')
        if dataset_key:
            return registry.datasets(uuid=dataset_key)
        return {}

    def fetch_species_data(self):
        """Return the name usage of the current occurrence's taxon.

        Returns ``{}`` when the occurrence has no ``taxonKey``.
        """
        taxon_key = self.data.get('taxonKey')
        print('fetch_species_data')
        print(taxon_key)
        if taxon_key:
            return species.name_usage(key=taxon_key)
        return {}

    def fetch_subdivisions_data(self):
        """Return the GADM subdivisions of ``FRA.3_1`` from the GBIF API.

        Raises:
            requests.HTTPError: if the API answers with an error status.
        """
        url = "https://api.gbif.org/v1/geocode/gadm/FRA.3_1/subdivisions"
        response = requests.get(url)
        response.raise_for_status()
        return response.json()

    @property
    def items(self):
        """Current occurrence record wrapped in a single-item list."""
        return [self.data]

    @property
    def total(self):
        """Total number of occurrences to process."""
        return len(self.occurrence_ids)

    def get_geom(self, row):
        """Build the point geometry of ``row`` from its decimal coordinates.

        Returns ``None`` when either coordinate is absent or null.
        """
        longitude = row.get('decimalLongitude')
        latitude = row.get('decimalLatitude')
        # A present-but-null coordinate would give the invalid WKT
        # "POINT(None None)", so it is treated like a missing one.
        if longitude is None or latitude is None:
            return None
        geom = wkt.loads(f"POINT({longitude} {latitude})")
        # Same value as the class-level ``srid``.
        return from_shape(geom, srid=4326)

    def integrate_data(self):
        """Return a copy of the current record enriched with related data.

        NOTE(review): ``cd_nom`` is initialised to ``None`` in ``__init__``
        and nothing in this class assigns it afterwards -- confirm whether
        it is meant to be filled in elsewhere.
        """
        integrated_data = self.data.copy()
        integrated_data['organization'] = self.organization_data
        integrated_data['dataset'] = self.dataset_data
        integrated_data['species'] = self.species_data
        integrated_data['cd_nom'] = self.cd_nom
        # integrated_data['subdivisions'] = self.subdivisions_data
        return integrated_data

    def next_row(self):
        """Yield the record of each occurrence, refreshing the related data."""
        for occurrence_id in self.occurrence_ids:
            self.occurrence_id = occurrence_id
            self.data = self.fetch_occurrence_data(occurrence_id)
            self.organization_data = self.fetch_organization_data()
            self.dataset_data = self.fetch_dataset_data()
            self.species_data = self.fetch_species_data()
            # The subdivisions URL does not depend on the occurrence, so it
            # is requested once rather than once per row.
            if self.subdivisions_data is None:
                self.subdivisions_data = self.fetch_subdivisions_data()
            yield self.data

8 changes: 8 additions & 0 deletions api2gn/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,14 @@ def run(self, dry_run=False):
class JSONParser(Parser):
limit = 100

def validate_maping(self):
    """
    Validate the merged mapping, constant fields and dynamic fields
    through the model (only the Synthese model is implemented).
    """
    # Later sources override earlier ones on duplicate keys.
    merged_fields = {}
    for source in (self.mapping, self.constant_fields, self.dynamic_fields):
        merged_fields.update(source)
    validator = MappingValidator(merged_fields)
    validator.validate()

def get_geom(self, row):
"""
Must return a wkb geom
Expand Down
74 changes: 74 additions & 0 deletions api2gn/var/config/parsers_gbif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from api2gn.parsers import WFSParser
from api2gn.geonature_parser import GeoNatureParser
from api2gn.gbif_parser import GBIFParser

# Fichier à renommer en parsers.py pour fonctionner, car parsers.py est dans le .gitignore
# Amélioration à faire : un fichier par parser

### Exemple func
# def my_custom_func(value):
# """
# Custom function to fill "observers" synthese with a depending api value
# """
# if value == "Org 1":
# return "Observateur inconnu"
# else:
# return "Observateur 1"


class GBIFParserInaturalist(GBIFParser):
    """GBIF parser configured for the iNaturalist dataset.

    Occurrences are selected either with ``api_filters`` (dataset plus
    geographic area) or with an explicit ``occurrence_ids`` list; the id
    list takes priority over the filter search.
    """

    name = "GBIF_INaturalist"
    description = (
        "Le Parser GBIF_INaturalist permet de récupérer les données en "
        "provenance de INaturalist depuis la plateforme du GBIF. "
        "Vous pouvez mettre un JDD et une zone geographique, ou une liste "
        "d'identifiants"
    )
    # No ``url`` needed: requests go through the pygbif library.

    # Parser limit; keep it equal to the API ``limit`` below.
    limit = 100

    # Filters for the occurrence search API. Comment this block out to use
    # the search by occurrence id instead; the values can be replaced to
    # filter differently.
    api_filters = {
        # iNaturalist research-grade dataset
        "datasetKey": "50c9509d-22c7-4a22-a47d-8c48425ef4a7",
        # Brittany polygon, taken from the URL of
        # https://www.gbif.org/occurrence/map?
        "wkt": (
            "POLYGON((-5.3685 46.16181,-0.53236 46.16181,"
            "-0.53236 49.21621,-5.3685 49.21621,-5.3685 46.16181))"
        ),
        "limit": "100",
    }

    # Comment out to use the API search; may be replaced by a dynamic list
    # of ids. The search by id takes priority over the filter search.
    # occurrence_ids = [4508012001]
    # occurrence_ids = [4508012001, 4507897058, 4507948047, 4507718106, 4507942081]

    # Example of dynamic_fields:
    # dynamic_fields = {
    #     # "unique_dataset_id" : "69f26484-08b6-4ccf-aeeb-42124d124fa1",  # iNaturalist test dataset
    #     # "id_dataset" : 705
    #     # "occurence_id" : "4407389321",
    #     # "altitude_min": my_custom_func
    # }

    # Overrides the inherited mapping.
    # Key: name of the Synthese column.
    # Value: name of the API column (str), or a dict for a custom value.
    mapping = {
        # "unique_id_sinp": "xxx",
        # "unique_id_sinp_grp": "xxx",
        "date_min": "eventDate",
        "date_max": "eventDate",
        "nom_cite": "scientificName",
        "observers": "recordedBy",
        "determiner": "recordedBy",
        "meta_create_date": "eventDate",
        "meta_update_date": "eventDate",
        "place_name": "verbatimLocality",
    }

    # Constants for values the API does not provide.
    constant_fields = {
        # To be created, or retrieved from the metadata.
        "id_source": 16,
        # Test dataset; eventually read the metadata and create the dataset
        # automatically.
        "id_dataset": 705,
        "count_min": 1,  # not available in the API
        "count_max": 1,  # not available in the API
        # "cd_nom": 4001,
    }
3 changes: 2 additions & 1 deletion requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ requests
marshmallow
pygml
geonature>2.12.0
tqdm
tqdm
pygbif