Skip to content

Commit

Permalink
Merge pull request #394 from ror-community/dev
Browse files Browse the repository at this point in the history
Merge dev to staging: Update index ror dump commands
  • Loading branch information
lizkrznarich authored Apr 18, 2024
2 parents 3f9f87a + 7c409d5 commit cd6ac78
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 44 deletions.
64 changes: 64 additions & 0 deletions rorapi/management/commands/getrordump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import json
import os
import re
import requests
import zipfile
import base64
from io import BytesIO
from rorapi.settings import ES7, ES_VARS, ROR_DUMP, DATA
from django.core.management.base import BaseCommand

# Headers sent with unauthenticated GitHub API requests.
HEADERS = dict(Accept='application/vnd.github.v3+json')
# Same headers plus a token-based Authorization entry built from settings.
AUTH_HEADERS = {
    'Authorization': f"token {ROR_DUMP['GITHUB_TOKEN']}",
    **HEADERS,
}

def get_ror_dump_sha(filename, use_test_data, github_headers):
    """Look up the blob SHA of a ROR dump file in the GitHub data repo.

    Args:
        filename: Name (or part of the name) of the dump to find; any entry
            whose 'name' contains this string is treated as a match.
        use_test_data: When truthy, query ROR_DUMP['TEST_REPO_URL'];
            otherwise query ROR_DUMP['PROD_REPO_URL'].
        github_headers: HTTP headers sent with the request.

    Returns:
        The 'sha' of the last matching entry, '' when no entry matches, or
        None when the response cannot be read as a list of entries.

    Raises:
        SystemExit: If the request to the contents URL fails.
    """
    repo_key = 'TEST_REPO_URL' if use_test_data else 'PROD_REPO_URL'
    contents_url = ROR_DUMP[repo_key] + '/contents'
    try:
        # requests applies no timeout by default; without one a stalled
        # connection would hang the command indefinitely.
        response = requests.get(contents_url, headers=github_headers, timeout=60)
    except requests.exceptions.RequestException as e:
        raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}")
    sha = ''
    try:
        for entry in response.json():
            if filename in entry['name']:
                sha = entry['sha']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or was JSON that is not a list of entries with
        # 'name'/'sha' keys. Narrow on purpose: a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        return None
    return sha

def get_ror_dump_zip(filename, use_test_data, github_headers):
    """Download a ROR dump blob from GitHub and save it as '<filename>.zip'.

    Args:
        filename: Dump name used both to find the blob and to name the
            output file (filename + '.zip', relative to the working dir).
        use_test_data: When truthy, use ROR_DUMP['TEST_REPO_URL'];
            otherwise use ROR_DUMP['PROD_REPO_URL'].
        github_headers: HTTP headers sent with the requests.

    Returns:
        The path of the written zip file, or None when no SHA was found for
        the dump or the blob could not be decoded and written.

    Raises:
        SystemExit: If a request to GitHub fails.
    """
    sha = get_ror_dump_sha(filename, use_test_data, github_headers)
    if not sha:
        # Covers both "no match" ('') and "unreadable listing" (None).
        return None
    repo_key = 'TEST_REPO_URL' if use_test_data else 'PROD_REPO_URL'
    blob_url = ROR_DUMP[repo_key] + '/git/blobs/' + sha
    try:
        # The timeout bounds connect/read stalls, not the total download
        # time, so a large blob still downloads as long as data keeps flowing.
        response = requests.get(blob_url, headers=github_headers, timeout=60)
    except requests.exceptions.RequestException as e:
        raise SystemExit(f"Github blob is Not reachable \nErr: {e}")
    try:
        # The 'content' field of the response is decoded as base64.
        file_decoded = base64.b64decode(response.json()['content'])
        with open(filename + '.zip', 'wb') as zip_file:
            zip_file.write(file_decoded)
    except (ValueError, KeyError, TypeError, OSError):
        # Invalid JSON/base64 (both ValueError subclasses), missing
        # 'content', unexpected payload shape, or a failed file write.
        # Narrow on purpose: a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        return None
    return zip_file.name

class Command(BaseCommand):
    """Management command that fetches a ROR data dump zip from GitHub."""

    help = 'Downloads a specified ROR data dump from Github'

    def handle(self, *args, **options):
        """Download the dump named by options['filename'].

        options['testdata'] selects the test repo instead of the production
        one. The download result is not inspected here.
        """
        dump_name = options['filename']
        from_test_repo = options['testdata']
        self.stdout.write('Getting ROR dump')
        # Send the Authorization header only when a token is configured.
        request_headers = AUTH_HEADERS if ROR_DUMP['GITHUB_TOKEN'] else HEADERS
        get_ror_dump_zip(dump_name, from_test_repo, request_headers)
52 changes: 9 additions & 43 deletions rorapi/management/commands/indexrordump.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,56 +96,18 @@ def index_dump(self, filename, index, dataset):
ES7.indices.delete(backup_index)
self.stdout.write('ROR dataset ' + filename + ' indexed')

def get_ror_dump_sha(filename, use_test_data):
    """Look up the blob SHA of a ROR dump file in the GitHub data repo.

    Args:
        filename: Name (or part of the name) of the dump to find; any entry
            whose 'name' contains this string is treated as a match.
        use_test_data: When truthy, query ROR_DUMP['TEST_REPO_URL'];
            otherwise query ROR_DUMP['PROD_REPO_URL'].

    Returns:
        The 'sha' of the last matching entry, '' when no entry matches, or
        None when the response cannot be read as a list of entries.

    Raises:
        SystemExit: If the request to the contents URL fails.
    """
    repo_key = 'TEST_REPO_URL' if use_test_data else 'PROD_REPO_URL'
    contents_url = ROR_DUMP[repo_key] + '/contents'
    try:
        # requests applies no timeout by default; without one a stalled
        # connection would hang the command indefinitely.
        response = requests.get(contents_url, headers=HEADERS, timeout=60)
    except requests.exceptions.RequestException as e:
        raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}")
    sha = ''
    try:
        for entry in response.json():
            if filename in entry['name']:
                sha = entry['sha']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or was JSON that is not a list of entries with
        # 'name'/'sha' keys. Narrow on purpose: a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        return None
    return sha

def get_ror_dump_zip(filename, use_test_data):
    """Download a ROR dump blob from GitHub and save it as '<filename>.zip'.

    Args:
        filename: Dump name used both to find the blob and to name the
            output file (filename + '.zip', relative to the working dir).
        use_test_data: When truthy, use ROR_DUMP['TEST_REPO_URL'];
            otherwise use ROR_DUMP['PROD_REPO_URL'].

    Returns:
        The path of the written zip file, or None when no SHA was found for
        the dump or the blob could not be decoded and written.

    Raises:
        SystemExit: If a request to GitHub fails.
    """
    sha = get_ror_dump_sha(filename, use_test_data)
    if not sha:
        # Covers both "no match" ('') and "unreadable listing" (None).
        return None
    repo_key = 'TEST_REPO_URL' if use_test_data else 'PROD_REPO_URL'
    blob_url = ROR_DUMP[repo_key] + '/git/blobs/' + sha
    try:
        # The timeout bounds connect/read stalls, not the total download
        # time, so a large blob still downloads as long as data keeps flowing.
        response = requests.get(blob_url, headers=HEADERS, timeout=60)
    except requests.exceptions.RequestException as e:
        raise SystemExit(f"Github blob is Not reachable \nErr: {e}")
    try:
        # The 'content' field of the response is decoded as base64.
        file_decoded = base64.b64decode(response.json()['content'])
        with open(filename + '.zip', 'wb') as zip_file:
            zip_file.write(file_decoded)
    except (ValueError, KeyError, TypeError, OSError):
        # Invalid JSON/base64 (both ValueError subclasses), missing
        # 'content', unexpected payload shape, or a failed file write.
        # Narrow on purpose: a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        return None
    return zip_file.name

class Command(BaseCommand):
help = 'Indexes ROR dataset from a full dump file in ror-data repo'

def handle(self, *args, **options):
json_files = []
filename = options['filename']
use_test_data = options['testdata']
ror_dump_zip = get_ror_dump_zip(filename, use_test_data)
if ror_dump_zip:
ror_dump_zip = filename + '.zip'
if os.path.exists(ror_dump_zip):
if not os.path.exists(DATA['WORKING_DIR']):
os.makedirs(DATA['WORKING_DIR'])
self.stdout.write('Extracting ROR dump')
with zipfile.ZipFile(ror_dump_zip, 'r') as zip_ref:
zip_ref.extractall(DATA['WORKING_DIR'] + filename)
unzipped_files = os.listdir(DATA['WORKING_DIR'] + filename)
Expand All @@ -155,13 +117,17 @@ def handle(self, *args, **options):
for json_file in json_files:
index = None
json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file
with open(json_path, 'r') as it:
dataset = json.load(it)
if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None):
self.stdout.write('Loading JSON')
with open(json_path, 'r') as it:
dataset = json.load(it)
self.stdout.write('Indexing ROR dataset ' + json_file)
index = ES_VARS['INDEX_V2']
index_dump(self, json_file, index, dataset)
if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None):
self.stdout.write('Loading JSON')
with open(json_path, 'r') as it:
dataset = json.load(it)
self.stdout.write('Indexing ROR dataset ' + json_file)
index = ES_VARS['INDEX_V1']
index_dump(self, json_file, index, dataset)
Expand Down
6 changes: 5 additions & 1 deletion rorapi/management/commands/setup.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import requests
import zipfile

import base64
from django.core.management.base import BaseCommand
from rorapi.management.commands.deleteindex import Command as DeleteIndexCommand
from rorapi.management.commands.createindex import Command as CreateIndexCommand
from rorapi.management.commands.indexrordump import Command as IndexRorDumpCommand
from rorapi.management.commands.getrordump import Command as GetRorDumpCommand
from rorapi.settings import ROR_DUMP

HEADERS = {'Accept': 'application/vnd.github.v3+json'}

HEADERS = {'Authorization': 'token {}'.format(ROR_DUMP['GITHUB_TOKEN']), 'Accept': 'application/vnd.github.v3+json'}

def get_ror_dump_sha(filename, use_test_data):
sha = ''
if use_test_data:
Expand Down Expand Up @@ -49,6 +52,7 @@ def handle(self, *args, **options):
sha = get_ror_dump_sha(filename, use_test_data)

if sha:
GetRorDumpCommand().handle(*args, **options)
DeleteIndexCommand().handle(*args, **options)
CreateIndexCommand().handle(*args, **options)
IndexRorDumpCommand().handle(*args, **options)
Expand Down

0 comments on commit cd6ac78

Please sign in to comment.