Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge dev to staging: Update index ror dump commands #394

Merged
merged 4 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions rorapi/management/commands/getrordump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import json
import os
import re
import requests
import zipfile
import base64
from io import BytesIO
from rorapi.settings import ES7, ES_VARS, ROR_DUMP, DATA
from django.core.management.base import BaseCommand

# Headers for unauthenticated requests: ask for the GitHub v3 JSON media type.
HEADERS = {'Accept': 'application/vnd.github.v3+json'}
# Same Accept header, preceded by a token Authorization header built from
# ROR_DUMP['GITHUB_TOKEN'].
AUTH_HEADERS = {
    'Authorization': f"token {ROR_DUMP['GITHUB_TOKEN']}",
    **HEADERS,
}

def get_ror_dump_sha(filename, use_test_data, github_headers):
    """Find the ``sha`` of a dump file listed in the configured GitHub repo.

    Requests ``<repo url>/contents`` and scans the returned entries for ones
    whose ``name`` contains ``filename``.

    Args:
        filename: Substring searched for in each entry's ``name``.
        use_test_data: When truthy, use ``ROR_DUMP['TEST_REPO_URL']``;
            otherwise ``ROR_DUMP['PROD_REPO_URL']``.
        github_headers: HTTP headers sent with the request.

    Returns:
        The ``sha`` of the last matching entry, ``''`` when no entry matches,
        or ``None`` when the response body is not JSON or does not have the
        expected list-of-entries shape.

    Raises:
        SystemExit: If the HTTP request itself fails.
    """
    repo_key = 'TEST_REPO_URL' if use_test_data else 'PROD_REPO_URL'
    contents_url = ROR_DUMP[repo_key] + '/contents'
    try:
        response = requests.get(contents_url, headers=github_headers)
    except requests.exceptions.RequestException as e:
        raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}") from e

    sha = ''
    try:
        # Every entry is inspected, so a later match overrides an earlier one.
        for entry in response.json():
            if filename in entry['name']:
                sha = entry['sha']
    except (ValueError, TypeError, KeyError):
        # ValueError: body is not valid JSON.
        # TypeError/KeyError: payload is not a list of dicts with name/sha
        # (for example an error object returned instead of a listing).
        return None
    return sha

def get_ror_dump_zip(filename, use_test_data, github_headers):
    """Download a dump blob from GitHub and write it to ``filename + '.zip'``.

    Resolves the blob ``sha`` with ``get_ror_dump_sha``, requests
    ``<repo url>/git/blobs/<sha>``, base64-decodes the ``content`` field of
    the JSON response and writes the bytes to disk.

    Args:
        filename: Dump name; used both for the lookup and as the output file
            stem.
        use_test_data: When truthy, use ``ROR_DUMP['TEST_REPO_URL']``;
            otherwise ``ROR_DUMP['PROD_REPO_URL']``.
        github_headers: HTTP headers sent with both requests.

    Returns:
        The path of the written zip file, or ``None`` when no sha was found
        or the blob could not be decoded or written.

    Raises:
        SystemExit: If an HTTP request itself fails.
    """
    sha = get_ror_dump_sha(filename, use_test_data, github_headers)
    if not sha:
        return None

    repo_key = 'TEST_REPO_URL' if use_test_data else 'PROD_REPO_URL'
    blob_url = ROR_DUMP[repo_key] + '/git/blobs/' + sha
    try:
        response = requests.get(blob_url, headers=github_headers)
    except requests.exceptions.RequestException as e:
        raise SystemExit(f"Github blob is Not reachable \nErr: {e}") from e

    try:
        file_decoded = base64.b64decode(response.json()['content'])
        with open(filename + '.zip', 'wb') as zip_file:
            zip_file.write(file_decoded)
    except (ValueError, TypeError, KeyError, OSError):
        # ValueError: invalid JSON or invalid base64 (binascii.Error).
        # TypeError/KeyError: payload lacks a usable 'content' field.
        # OSError: the zip file could not be created or written.
        return None
    return zip_file.name

class Command(BaseCommand):
    """Management command that fetches a ROR data dump zip from GitHub."""

    help = 'Downloads a specified ROR data dump from Github'

    def handle(self, *args, **options):
        """Download the dump named by ``options['filename']``.

        Reads ``options['filename']`` and ``options['testdata']``, picks the
        authenticated headers when ``ROR_DUMP['GITHUB_TOKEN']`` is truthy and
        the plain headers otherwise, then calls ``get_ror_dump_zip``.
        """
        filename, use_test_data = options['filename'], options['testdata']
        self.stdout.write('Getting ROR dump')
        github_headers = AUTH_HEADERS if ROR_DUMP['GITHUB_TOKEN'] else HEADERS
        # The returned path (or None) is not used further in this command.
        ror_dump_zip = get_ror_dump_zip(filename, use_test_data, github_headers)
52 changes: 9 additions & 43 deletions rorapi/management/commands/indexrordump.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,56 +96,18 @@ def index_dump(self, filename, index, dataset):
ES7.indices.delete(backup_index)
self.stdout.write('ROR dataset ' + filename + ' indexed')

def get_ror_dump_sha(filename, use_test_data):
    """Find the ``sha`` of a dump file listed in the configured GitHub repo.

    Requests ``<repo url>/contents`` with the module-level ``HEADERS`` and
    scans the returned entries for ones whose ``name`` contains ``filename``.

    Args:
        filename: Substring searched for in each entry's ``name``.
        use_test_data: When truthy, use ``ROR_DUMP['TEST_REPO_URL']``;
            otherwise ``ROR_DUMP['PROD_REPO_URL']``.

    Returns:
        The ``sha`` of the last matching entry, ``''`` when no entry matches,
        or ``None`` when the response body is not JSON or does not have the
        expected list-of-entries shape.

    Raises:
        SystemExit: If the HTTP request itself fails.
    """
    if use_test_data:
        contents_url = ROR_DUMP['TEST_REPO_URL'] + '/contents'
    else:
        contents_url = ROR_DUMP['PROD_REPO_URL'] + '/contents'
    try:
        response = requests.get(contents_url, headers=HEADERS)
    except requests.exceptions.RequestException as e:
        raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}") from e

    sha = ''
    try:
        repo_contents = response.json()
        # Every entry is inspected, so a later match overrides an earlier one.
        for entry in repo_contents:
            if filename in entry['name']:
                sha = entry['sha']
    except (ValueError, TypeError, KeyError):
        # Body is not JSON, or is not a list of dicts carrying name/sha.
        return None
    return sha

def get_ror_dump_zip(filename, use_test_data):
    """Download a dump blob from GitHub and write it to ``filename + '.zip'``.

    Resolves the blob ``sha`` with ``get_ror_dump_sha``, requests
    ``<repo url>/git/blobs/<sha>`` with the module-level ``HEADERS``,
    base64-decodes the ``content`` field of the JSON response and writes the
    bytes to disk.

    Args:
        filename: Dump name; used both for the lookup and as the output file
            stem.
        use_test_data: When truthy, use ``ROR_DUMP['TEST_REPO_URL']``;
            otherwise ``ROR_DUMP['PROD_REPO_URL']``.

    Returns:
        The path of the written zip file, or ``None`` when no sha was found
        or the blob could not be decoded or written.

    Raises:
        SystemExit: If an HTTP request itself fails.
    """
    sha = get_ror_dump_sha(filename, use_test_data)
    if not sha:
        return None

    if use_test_data:
        blob_url = ROR_DUMP['TEST_REPO_URL'] + '/git/blobs/' + sha
    else:
        blob_url = ROR_DUMP['PROD_REPO_URL'] + '/git/blobs/' + sha
    try:
        response = requests.get(blob_url, headers=HEADERS)
    except requests.exceptions.RequestException as e:
        raise SystemExit(f"Github blob is Not reachable \nErr: {e}") from e

    try:
        response_json = response.json()
        file_decoded = base64.b64decode(response_json['content'])
        with open(filename + '.zip', 'wb') as zip_file:
            zip_file.write(file_decoded)
    except (ValueError, TypeError, KeyError, OSError):
        # Invalid JSON/base64, missing 'content', or a file write failure.
        return None
    return zip_file.name

class Command(BaseCommand):
help = 'Indexes ROR dataset from a full dump file in ror-data repo'

def handle(self, *args, **options):
json_files = []
filename = options['filename']
use_test_data = options['testdata']
ror_dump_zip = get_ror_dump_zip(filename, use_test_data)
if ror_dump_zip:
ror_dump_zip = filename + '.zip'
if os.path.exists(ror_dump_zip):
if not os.path.exists(DATA['WORKING_DIR']):
os.makedirs(DATA['WORKING_DIR'])
self.stdout.write('Extracting ROR dump')
with zipfile.ZipFile(ror_dump_zip, 'r') as zip_ref:
zip_ref.extractall(DATA['WORKING_DIR'] + filename)
unzipped_files = os.listdir(DATA['WORKING_DIR'] + filename)
Expand All @@ -155,13 +117,17 @@ def handle(self, *args, **options):
for json_file in json_files:
index = None
json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file
with open(json_path, 'r') as it:
dataset = json.load(it)
if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None):
self.stdout.write('Loading JSON')
with open(json_path, 'r') as it:
dataset = json.load(it)
self.stdout.write('Indexing ROR dataset ' + json_file)
index = ES_VARS['INDEX_V2']
index_dump(self, json_file, index, dataset)
if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None):
self.stdout.write('Loading JSON')
with open(json_path, 'r') as it:
dataset = json.load(it)
self.stdout.write('Indexing ROR dataset ' + json_file)
index = ES_VARS['INDEX_V1']
index_dump(self, json_file, index, dataset)
Expand Down
6 changes: 5 additions & 1 deletion rorapi/management/commands/setup.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import requests
import zipfile

import base64
from django.core.management.base import BaseCommand
from rorapi.management.commands.deleteindex import Command as DeleteIndexCommand
from rorapi.management.commands.createindex import Command as CreateIndexCommand
from rorapi.management.commands.indexrordump import Command as IndexRorDumpCommand
from rorapi.management.commands.getrordump import Command as GetRorDumpCommand
from rorapi.settings import ROR_DUMP

HEADERS = {'Accept': 'application/vnd.github.v3+json'}

HEADERS = {'Authorization': 'token {}'.format(ROR_DUMP['GITHUB_TOKEN']), 'Accept': 'application/vnd.github.v3+json'}

def get_ror_dump_sha(filename, use_test_data):
sha = ''
if use_test_data:
Expand Down Expand Up @@ -49,6 +52,7 @@ def handle(self, *args, **options):
sha = get_ror_dump_sha(filename, use_test_data)

if sha:
GetRorDumpCommand().handle(*args, **options)
DeleteIndexCommand().handle(*args, **options)
CreateIndexCommand().handle(*args, **options)
IndexRorDumpCommand().handle(*args, **options)
Expand Down
Loading