Skip to content

Commit

Permalink
Build ROR importer #3168
Browse files Browse the repository at this point in the history
  • Loading branch information
joemull committed Dec 11, 2024
1 parent f46ab9a commit 86f02e2
Show file tree
Hide file tree
Showing 7 changed files with 404 additions and 4 deletions.
47 changes: 47 additions & 0 deletions src/core/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,50 @@ class AccessRequestAdmin(admin.ModelAdmin):
date_hierarchy = ('requested')


class OrganizationAdmin(admin.ModelAdmin):
    """Admin configuration for Organization records."""

    list_display = (
        'pk',
        'ror',
        '_ror_display',
        '_custom_label',
        '_locations',
        'ror_status',
    )
    list_display_links = ('pk', 'ror')
    list_filter = ('ror_status', 'locations__country')
    search_fields = (
        'pk',
        'ror_display__value',
        'custom_label__value',
        'labels__value',
        'aliases__value',
        'acronyms__value',
    )
    raw_id_fields = ('locations', )

    def _ror_display(self, obj):
        """Return obj.ror_display, or '' when obj or that value is falsy."""
        if not obj or not obj.ror_display:
            return ''
        return obj.ror_display

    def _locations(self, obj):
        """Return the string form of each related location, joined by '; '."""
        if not obj:
            return ''
        rendered = [str(place) for place in obj.locations.all()]
        return '; '.join(rendered)

    def _custom_label(self, obj):
        """Return obj.custom_label, or '' when obj or that value is falsy."""
        if not obj or not obj.custom_label:
            return ''
        return obj.custom_label


class OrganizationNameAdmin(admin.ModelAdmin):
    """
    Admin configuration for OrganizationName records.

    The list shows each name's value and language. The relations back to
    the owning organization are edited through raw ID widgets.
    """

    list_display = ('pk', 'value', 'language')
    list_display_links = ('pk', 'value')
    search_fields = ('pk', 'value')
    raw_id_fields = ('ror_display_for', 'custom_label_for',
                     'label_for', 'alias_for', 'acronym_for')

    # The _ror_display, _locations and _custom_label helpers that used to
    # live here were copied from OrganizationAdmin. They were not listed in
    # list_display and read organization-side attributes rather than the
    # *_for relations this admin exposes, so they have been removed.


class LocationAdmin(admin.ModelAdmin):
    """Admin configuration for Location records."""

    list_display = (
        'pk',
        'name',
        'country',
        'geonames_id',
    )
    list_display_links = (
        'pk',
        'name',
    )
    list_filter = ('country',)
    search_fields = (
        'pk',
        'name',
        'country__code',
        'country__name',
        'geonames_id',
    )


admin_list = [
(models.AccountRole, AccountRoleAdmin),
(models.Account, AccountAdmin),
Expand Down Expand Up @@ -427,6 +471,9 @@ class AccessRequestAdmin(admin.ModelAdmin):
(models.Contacts, ContactsAdmin),
(models.Contact, ContactAdmin),
(models.AccessRequest, AccessRequestAdmin),
(models.Organization, OrganizationAdmin),
(models.OrganizationName, OrganizationNameAdmin),
(models.Location, LocationAdmin),
]

[admin.site.register(*t) for t in admin_list]
80 changes: 80 additions & 0 deletions src/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
__license__ = "AGPL v3"
__maintainer__ = "Birkbeck Centre for Technology and Publishing"

from decimal import Decimal
import os
import re
import uuid
Expand All @@ -13,6 +14,8 @@
import pytz
from hijack.signals import hijack_started, hijack_ended
import warnings
import tqdm
import zipfile

from bs4 import BeautifulSoup
from django.conf import settings
Expand Down Expand Up @@ -51,6 +54,7 @@
from review import models as review_models
from copyediting import models as copyediting_models
from repository import models as repository_models
from utils.models import RORImportError
from submission import models as submission_models
from utils.logger import get_logger
from utils import logic as utils_logic
Expand Down Expand Up @@ -2080,6 +2084,82 @@ def naive_get_or_create(

return organization, created

@classmethod
def create_from_ror_record(cls, record):
    """
    Creates one organization object in Janeway from a ROR JSON record,
    using version 2 of the ROR Schema.
    See https://ror.readme.io/v2/docs/data-structure

    The organization is looked up (or created) by its ROR id, its status
    is refreshed, and an OrganizationName and a Location are fetched or
    created for every name and location in the record.

    :param record: a dict parsed from one ROR v2 JSON record
    :return: a tuple of (organization, created), where created is True
        if the organization did not exist before this call
    :raises decimal.InvalidOperation: if a location has no usable
        lat / lng value
    """
    organization, created = cls.objects.get_or_create(
        ror=record.get('id', ''),
    )
    status = record.get('status')
    if status:
        organization.ror_status = status
        organization.save()

    # Maps each ROR name type to the OrganizationName relation it fills.
    name_type_fields = (
        ('ror_display', 'ror_display_for'),
        ('label', 'label_for'),
        ('alias', 'alias_for'),
        ('acronym', 'acronym_for'),
    )
    for name in record.get('names') or []:
        kwargs = {'value': name.get('value', '')}
        # The ROR v2 schema stores the language code under "lang".
        language = name.get('lang')
        if language:
            kwargs['language'] = language
        name_types = name.get('types') or []
        for name_type, field_name in name_type_fields:
            if name_type in name_types:
                kwargs[field_name] = organization
        OrganizationName.objects.get_or_create(**kwargs)

    for ror_location in record.get('locations') or []:
        details = ror_location.get('geonames_details') or {}
        country, _country_created = Country.objects.get_or_create(
            code=details.get('country_code', ''),
        )
        # Convert via str so the Decimal holds the value as written in
        # the JSON rather than the binary expansion of a float.
        latitude = Decimal(str(details.get('lat')))
        longitude = Decimal(str(details.get('lng')))
        location, _location_created = Location.objects.get_or_create(
            name=details.get('name', ''),
            country=country,
            latitude=latitude,
            longitude=longitude,
            geonames_id=ror_location.get('geonames_id'),
        )
        organization.locations.add(location)

    return organization, created


@classmethod
def import_ror_batch(cls, ror_import, test_full_import=False):
    """
    Opens a previously downloaded data dump from
    ROR's Zenodo endpoint, processes the records,
    and records errors for exceptions raised during creation.
    https://ror.readme.io/v2/docs/data-dump

    :param ror_import: the import object whose zip_path points at the
        downloaded dump; errors are attached to it
    :param test_full_import: when False and settings.DEBUG is on, only
        the first 100 records of each file are processed
    """
    error_count = 0
    with zipfile.ZipFile(ror_import.zip_path, mode='r') as zip_ref:
        for file_info in zip_ref.infolist():
            # Only files whose name ends in v2.json are read.
            if not file_info.filename.endswith('v2.json'):
                continue
            json_string = zip_ref.read(file_info).decode(encoding="utf-8")
            data = json.loads(json_string)
            if settings.DEBUG and not test_full_import:
                # Limit the import run during development by default
                data = data[:100]
            for item in tqdm.tqdm(data):
                try:
                    cls.create_from_ror_record(item)
                except Exception as error:
                    # One bad record must not abort the batch; keep the
                    # error and the offending record for later review.
                    message = f'{error}\n{json.dumps(item)}'
                    RORImportError.objects.create(
                        ror_import=ror_import,
                        message=message,
                    )
                    error_count += 1
    if error_count:
        logger.warning(
            f'ROR import errors logged: {error_count}'
        )


class Affiliation(models.Model):
account = models.ForeignKey(
Expand Down
25 changes: 24 additions & 1 deletion src/utils/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,34 @@ class VersionAdmin(admin.ModelAdmin):
date_hierarchy = ('date')


class RORImportAdmin(admin.ModelAdmin):
    """Admin configuration for ROR import runs, with their errors inline."""

    date_hierarchy = 'started'
    list_display = (
        'pk',
        'status',
        'started',
        'stopped',
    )
    list_filter = (
        'status',
        'started',
        'stopped',
    )
    search_fields = (
        'rorimporterror__message',
        'records',
    )
    # Every listed field is display-only in the admin form.
    readonly_fields = (
        'started',
        'stopped',
        'status',
        'records',
    )
    inlines = [admin_utils.RORImportErrorInline]


class RORImportErrorAdmin(admin.ModelAdmin):
    """Admin configuration for individual ROR import errors."""

    date_hierarchy = 'ror_import__started'
    list_display = ('pk', '_first_line')
    search_fields = ('message',)
    raw_id_fields = ('ror_import', )

    def _first_line(self, obj):
        """Return the text of obj.message before its first newline, or ''."""
        if not obj or not obj.message:
            return ''
        first_line, _separator, _rest = obj.message.partition('\n')
        return first_line


# Pairs of (model, admin class) registered with the admin site below.
admin_list = [
    (models.LogEntry, LogAdmin),
    (models.Plugin, PluginAdmin),
    (models.ImportCacheEntry, ImportCacheAdmin),
    (models.Version, VersionAdmin),
    (models.RORImport, RORImportAdmin),
    (models.RORImportError, RORImportErrorAdmin),
]

[admin.site.register(*t) for t in admin_list]
6 changes: 6 additions & 0 deletions src/utils/admin_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,12 @@ class NewsItemInline(admin.TabularInline):
raw_id_fields = ('newsitem',)


class RORImportErrorInline(admin.TabularInline):
    """Tabular inline listing import errors with a read-only message."""

    # NOTE(review): RORImportError appears to be defined in utils.models and
    # is reachable here only through the import in core.models -- confirm
    # before relying on this path.
    model = core_models.RORImportError
    readonly_fields = ('message',)
    # Show no blank extra rows.
    extra = 0


class JournalFilterBase(admin.SimpleListFilter):
"""
A base class for other journal filters
Expand Down
53 changes: 53 additions & 0 deletions src/utils/management/commands/import_ror_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from django.conf import settings
from django.core.management.base import BaseCommand

from utils.models import RORImport
from core.models import Organization
from utils.logger import get_logger


logger = get_logger(__name__)


class Command(BaseCommand):
    """
    Fetches ROR data and generates Organization records.
    """

    help = "Fetches ROR data and generates Organization records."

    def add_arguments(self, parser):
        """Register the --test_full_import flag on the argument parser."""
        parser.add_argument(
            '--test_full_import',
            # The trailing space keeps the two sentences apart once the
            # adjacent string literals are concatenated.
            help='By default, the command only runs 100 records when DEBUG=True. '
                 'Pass --test_full_import to import the entire dump in development.',
            action='store_true',
        )
        return super().add_arguments(parser)

    def handle(self, *args, **options):
        """
        Create a RORImport, download the data dump if needed, import it,
        and mark the import successful if it is still ongoing at the end.
        """
        ror_import = RORImport.objects.create()
        ror_import.get_records()

        # The import is necessary.
        # Check we have the right copy of the data dump.
        if ror_import.ongoing or settings.DEBUG:
            if not ror_import.previous_import:
                ror_import.download_data()
            elif ror_import.previous_import.zip_path != ror_import.zip_path:
                ror_import.download_data()

        # The data is all downloaded and ready to import.
        # The condition is evaluated again after the download step.
        if ror_import.ongoing or settings.DEBUG:
            test_full_import = options.get('test_full_import', False)
            Organization.import_ror_batch(
                ror_import,
                test_full_import=test_full_import,
            )

        # The process did not error out, so it can be considered a success.
        if ror_import.ongoing:
            ror_import.status = ror_import.RORImportStatus.SUCCESSFUL
            ror_import.save()

        logger.info(ror_import.status)
37 changes: 37 additions & 0 deletions src/utils/migrations/0035_rorimport_rorimporterror.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Generated by Django 4.2.14 on 2024-07-26 20:51

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Adds the RORImport and RORImportError models to the utils app.

    dependencies = [
        ('utils', '0034_rename_toaddress_addressee'),
    ]

    operations = [
        # One row per import run: start/stop timestamps, a status chosen
        # from the listed values (default 'ongoing'), and a JSON field of
        # records.
        migrations.CreateModel(
            name='RORImport',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('started', models.DateTimeField(auto_now_add=True)),
                ('stopped', models.DateTimeField(blank=True, null=True)),
                ('status', models.CharField(choices=[('ongoing', 'Ongoing'), ('unnecessary', 'Unnecessary'), ('successful', 'Successful'), ('failed', 'Failed')], default='ongoing')),
                ('records', models.JSONField(default=dict)),
            ],
            options={
                'verbose_name': 'ROR import',
                'verbose_name_plural': 'ROR imports',
                'get_latest_by': 'started',
            },
        ),
        # One row per error message; rows are deleted together with the
        # import they belong to (CASCADE).
        migrations.CreateModel(
            name='RORImportError',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('message', models.TextField(blank=True)),
                ('ror_import', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='utils.rorimport')),
            ],
        ),
    ]
Loading

0 comments on commit 86f02e2

Please sign in to comment.