diff --git a/.gitignore b/.gitignore
index 8cc188d..763b37d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,4 +163,5 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
\ No newline at end of file
+#.idea/
+api2gn/dev.md
diff --git a/README.md b/README.md
index d44ae64..24c6e2c 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,33 @@
 # Installation
 
-    #Depuis le venv de GeoNature
-    source /backend/venv/bin/activate
+```
+    # Download the release archive
+    wget https://github.com/PnX-SI/api2GN/archive/1.0.0.rc1.zip
+    unzip 1.0.0.rc1.zip
+    rm 1.0.0.rc1.zip
+    mv api2GN-1.0.0.rc1/ api2GN
+
+    # Activate the GeoNature venv
+    cd geonature
+    source backend/venv/bin/activate
+
     # depuis le répertoire API2GN
-    pip install .
+    pip install .
+    # or, for development mode:
+    pip install -e .
+
+    # Register the module in the GeoNature admin interface
+    geonature install-gn-module ~/api2GN API2GN
+
+    # Apply the database migrations
+    geonature db upgrade api2gn@head
+
+```
 
 Copier le fichier d'exemple `var/config/parsers.example.py` en `var/config/parsers.py`
+```
+cp var/config/parsers.example.py var/config/parsers.py
+```
 
 ## Commandes
@@ -18,6 +40,8 @@ Copier le fichier d'exemple `var/config/parsers.example.py` en `var/config/parse
 - Lancer un parser
 ```
 geonature parser run
+geonature parser run --dry-run
+# --dry-run: test the parser without writing anything to the database
 ```
 
 ### Créer ses propres parser
diff --git a/api2gn/.env.sample b/api2gn/.env.sample
new file mode 100644
index 0000000..548da30
--- /dev/null
+++ b/api2gn/.env.sample
@@ -0,0 +1 @@
+# Store here the environment variables used by your custom parsers
\ No newline at end of file
diff --git a/api2gn/gbif_parser.py b/api2gn/gbif_parser.py
new file mode 100644
index 0000000..d1f0a0f
--- /dev/null
+++ b/api2gn/gbif_parser.py
@@ -0,0 +1,216 @@
+from pygbif import occurrences, registry, species
+from shapely import wkt
+from geoalchemy2.shape import from_shape
+from api2gn.parsers import JSONParser
+import requests
+
+
+def fetch_taxref_cd_nom(row):
+    """
+    Resolve the TAXREF cd_nom of a GBIF occurrence row.
+
+    First queries the TAXREF "findByExternalId" endpoint with the GBIF
+    taxonKey; on failure, falls back to a fuzzy match on the scientific name.
+    """
+    try:
+        occurrence = occurrences.get(row['key'])
+        url = (
+            "https://taxref.mnhn.fr/api/taxa/findByExternalId"
+            "?externalDbId=gbif&externalId=" + str(occurrence['taxonKey'])
+        )
+        response = requests.get(url)
+        response.raise_for_status()
+        data = response.json()
+
+        if not data.get('referenceId'):
+            return '183716'  # default: Animalia
+        return str(data['referenceId'])
+    except Exception as e:
+        # Fallback: fuzzy match on the scientific name
+        print(f"An error occurred: {e}")
+        occurrence = occurrences.get(row['key'])
+        url = "https://taxref.mnhn.fr/api/taxa/fuzzyMatch?term=" + str(occurrence['scientificName'])
+        response = requests.get(url)
+        response.raise_for_status()
+        data = response.json()
+        # referenceId of the first fuzzy-match candidate
+        return str(data['_embedded']['taxa'][0]['referenceId'])
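+
+# Illustrative sketch of the two TAXREF lookups above. The identifiers and
+# response bodies are hypothetical, shown only to document the expected shape:
+#   GET https://taxref.mnhn.fr/api/taxa/findByExternalId?externalDbId=gbif&externalId=2440946
+#     -> {"referenceId": 60585, ...}
+#   GET https://taxref.mnhn.fr/api/taxa/fuzzyMatch?term=Vulpes vulpes
+#     -> {"_embedded": {"taxa": [{"referenceId": 60585, ...}]}}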
+
+
+class GBIFParser(JSONParser):
+    srid = 4326
+    progress_bar = False  # not useful here: the data comes from many single-occurrence requests
+    occurrence_ids = []  # default occurrence list
+
+    def __init__(self):
+        print(self.occurrence_ids)
+        self.api_filters = {**GBIFParser.api_filters, **self.api_filters}
+        self.mapping = {**GBIFParser.mapping, **self.mapping}
+        self.constant_fields = {
+            **GBIFParser.constant_fields,
+            **self.constant_fields,
+        }
+        # Initialize the parent class
+        super().__init__()
+
+        self.occurrence_id = None  # initialization
+        self.data = None
+        self.organization_data = None
+        self.dataset_data = None
+        self.species_data = None
+        self.subdivisions_data = None
+        self.cd_nom = None
+
+        self.validate_mapping()
+
+        if not self.occurrence_ids:
+            self.occurrence_ids = self.fetch_occurrence_ids_search()
+        print("Occurrence ID list:", self.occurrence_ids)
+
+    def fetch_occurrence_ids_search(self):
+        occurrence_ids = []
+        print(self.api_filters)
+        response = occurrences.search(
+            datasetKey=self.api_filters['datasetKey'],
+            geometry=self.api_filters['wkt'],
+            limit=self.api_filters['limit'],
+        )
+        if 'results' in response and len(response['results']) > 0:
+            for result in response['results']:
+                # read the occurrence key directly at the second level
+                if 'key' in result:
+                    occurrence_ids.append(result['key'])
+            return occurrence_ids
+        elif 'data' in response:
+            return response['data']
+        elif len(response) > 0:
+            for result in response:
+                if 'key' in result:
+                    occurrence_ids.append(result['key'])
+            return occurrence_ids
+        else:
+            raise ValueError("Failed to fetch data for search")
+
+    def fetch_occurrence_data(self, occurrence_id):
+        response = occurrences.get(occurrence_id)
+        if 'results' in response and len(response['results']) > 0:
+            return response['results'][0]
+        elif 'data' in response:
+            return response['data']
+        elif len(response) > 0:
+            return response
+        else:
+            raise ValueError(f"Failed to fetch data for occurrence ID {occurrence_id}")
+
+    def fetch_organization_data(self):
+        organization_key = self.data.get('publishingOrgKey')
+        if organization_key:
+            return registry.organizations(uuid=organization_key)
+        return {}
+
+    def fetch_dataset_data(self):
+        dataset_key = self.data.get('datasetKey')
+        if dataset_key:
+            return registry.datasets(uuid=dataset_key)
+        return {}
+
+    def fetch_species_data(self):
+        taxon_key = self.data.get('taxonKey')
+        if taxon_key:
+            return species.name_usage(key=taxon_key)
+        return {}
+
+    def fetch_subdivisions_data(self):
+        url = "https://api.gbif.org/v1/geocode/gadm/FRA.3_1/subdivisions"
+        response = requests.get(url)
+        response.raise_for_status()
+        return response.json()
+
+    @property
+    def items(self):
+        return [self.data]  # return the current occurrence as a single-item list
+
+    @property
+    def total(self):
+        return len(self.occurrence_ids)  # total number of occurrences
+
+    def get_geom(self, row):
+        if 'decimalLatitude' in row and 'decimalLongitude' in row:
+            point = f"POINT({row['decimalLongitude']} {row['decimalLatitude']})"
+            geom = wkt.loads(point)
+            return from_shape(geom, srid=self.srid)
+        return None
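+    # Example (illustrative coordinates): for a row such as
+    #   {"decimalLongitude": -1.68, "decimalLatitude": 48.11}
+    # the method builds the WKT "POINT(-1.68 48.11)", parses it with shapely
+    # and returns a geoalchemy2 WKBElement carrying SRID 4326.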
+
+    def integrate_data(self):
+        integrated_data = self.data.copy()
+        integrated_data['organization'] = self.organization_data
+        integrated_data['dataset'] = self.dataset_data
+        integrated_data['species'] = self.species_data
+        integrated_data['cd_nom'] = self.cd_nom
+        # integrated_data['subdivisions'] = self.subdivisions_data
+        return integrated_data
+
+    def next_row(self):
+        for occurrence_id in self.occurrence_ids:
+            self.occurrence_id = occurrence_id
+            self.data = self.fetch_occurrence_data(occurrence_id)
+            self.organization_data = self.fetch_organization_data()
+            self.dataset_data = self.fetch_dataset_data()
+            self.species_data = self.fetch_species_data()
+            self.subdivisions_data = self.fetch_subdivisions_data()
+            yield self.data
+
+    # Override kept for testing
+    # def run(self, dry_run):
+    #     if dry_run:
+    #         print("Running in dry run mode")
+    #         integrated_data = self.integrate_data()
+    #         print(integrated_data)
+    #         print(self.total)
+    #         for item in self.items:
+    #             print(self.get_geom(item))
+    #     else:
+    #         print("Running in normal mode")
+
+    ### Mapping to improve
+    mapping = {
+        # "unique_id_sinp": "identifier",
+        "date_min": "eventDate",
+        "date_max": "eventDate",
+        "nom_cite": "scientificName",
+        "count_min": 1,
+        "count_max": 1,
+        "observers": "recordedBy",
+        "determiner": "recordedBy",
+        "meta_create_date": "eventDate",
+        "meta_update_date": "eventDate",
+        "place_name": "verbatimLocality",
+        "entity_source_pk_value": "catalogNumber",
+    }
+    dynamic_fields = {
+        # "unique_dataset_id": "69f26484-08b6-4ccf-aeeb-42124d124fa1",  # iNaturalist test dataset
+        # "id_dataset": 705,
+        # "occurence_id": "4407389321",
+        # "altitude_min": my_custom_func,
+        "cd_nom": fetch_taxref_cd_nom
+    }
diff --git a/api2gn/parsers.py b/api2gn/parsers.py
index 24d6934..d90c245 100644
--- a/api2gn/parsers.py
+++ b/api2gn/parsers.py
@@ -162,6 +162,14 @@ def run(self, dry_run=False):
 class JSONParser(Parser):
     limit = 100
 
+    def validate_mapping(self):
+        """
+        Validate the mapping against the model (only the Synthese model is implemented).
+        """
+        MappingValidator(
+            {**self.mapping, **self.constant_fields, **self.dynamic_fields}
+        ).validate()
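+
+    # Hypothetical example: given a key that is not a Synthese column, e.g.
+    #   MappingValidator({"not_a_column": "x"}).validate()
+    # the validator is expected to reject the mapping, so typos are caught
+    # before any row is imported.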
+ "limit" : "100" , + } + + # Mettre en commentaire pour utiliser la recherche API + ## Vous pouvez remplacer par une liste dynamique d'IDs + # occurrence_ids = [4508012001] + # occurrence_ids = [4508012001, 4507897058, 4507948047, 4507718106, 4507942081] + + #--> La recherche par id est prioritaire sur la recherche par filtre + + ### Exemple dynamic_fields + # dynamic_fields = { + # # "unique_dataset_id" : "69f26484-08b6-4ccf-aeeb-42124d124fa1", # JDD test Inaturalist + # # "id_dataset" : 705 + # # # "occurence_id" : "4407389321", + # # "altitude_min": my_custom_func + # } + + # override existant GeoNatureParser mapping + # the key is the name of the column is synthese + # the value could be a str of the column in the API or a dict for a custom value + + mapping = { + # "unique_id_sinp": "xxx", + # "unique_id_sinp_grp": "xxx", + "date_min": "eventDate", + "date_max": "eventDate", + "nom_cite": "scientificName", + "observers": "recordedBy", + "determiner": "recordedBy", + "meta_create_date": "eventDate", + "meta_update_date": "eventDate", + "place_name": "verbatimLocality", + } + + # pass constant from missing value in my API + constant_fields = { + "id_source": 16, # a creer ou a récupérer depuis metadonnées + "id_dataset": 705, # Creer JDD test, a terme récupérer les métadonnées et creer JDD en auto + "count_min": 1, # Non disponible dans api + "count_max": 1, # Non disponible dans api + # "cd_nom": 4001, + + } diff --git a/requirements.in b/requirements.in index 047373f..9281f5d 100644 --- a/requirements.in +++ b/requirements.in @@ -3,4 +3,5 @@ requests marshmallow pygml geonature>2.12.0 -tqdm \ No newline at end of file +tqdm +pygbif \ No newline at end of file