NOTE(review): line breaks and indentation were reconstructed from a whitespace-collapsed copy of this patch.
Context-line indentation is inferred and the index hashes predate the edits below; verify against the target files before applying.

diff --git a/rorapi/management/commands/getrordump.py b/rorapi/management/commands/getrordump.py
new file mode 100644
index 0000000..4bee42d
--- /dev/null
+++ b/rorapi/management/commands/getrordump.py
@@ -0,0 +1,86 @@
+import json
+import os
+import re
+import requests
+import zipfile
+import base64
+from io import BytesIO
+from rorapi.settings import ES7, ES_VARS, ROR_DUMP, DATA
+from django.core.management.base import BaseCommand
+
+HEADERS = {'Accept': 'application/vnd.github.v3+json'}
+AUTH_HEADERS = {'Authorization': 'token {}'.format(ROR_DUMP['GITHUB_TOKEN']), 'Accept': 'application/vnd.github.v3+json'}
+
+def get_ror_dump_sha(filename, use_test_data, github_headers):
+    """Look up the git blob SHA of a dump file in the ROR data repo.
+
+    Returns the SHA of the last entry in the contents listing whose name
+    contains `filename`, '' when nothing matches, or None when the response
+    is not the expected JSON listing. Exits when the request itself fails.
+    """
+    sha = ''
+    if use_test_data:
+        contents_url = ROR_DUMP['TEST_REPO_URL'] + '/contents'
+    else:
+        contents_url = ROR_DUMP['PROD_REPO_URL'] + '/contents'
+    try:
+        response = requests.get(contents_url, headers=github_headers)
+    except requests.exceptions.RequestException as e:
+        raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}")
+    try:
+        repo_contents = response.json()
+        for file in repo_contents:
+            if filename in file['name']:
+                sha = file['sha']
+        return sha
+    except (ValueError, KeyError, TypeError):
+        # Body was not JSON, or not a file listing (e.g. an error object).
+        return None
+
+def get_ror_dump_zip(filename, use_test_data, github_headers):
+    """Download the named dump and write it to `filename` + '.zip'.
+
+    Returns the written file's name, or None when no matching dump is found
+    or the blob response cannot be decoded or saved.
+    """
+    sha = get_ror_dump_sha(filename, use_test_data, github_headers)
+    if sha:
+        if use_test_data:
+            blob_url = ROR_DUMP['TEST_REPO_URL'] + '/git/blobs/' + sha
+        else:
+            blob_url = ROR_DUMP['PROD_REPO_URL'] + '/git/blobs/' + sha
+        try:
+            response = requests.get(blob_url, headers=github_headers)
+        except requests.exceptions.RequestException as e:
+            raise SystemExit(f"Github blob is Not reachable \nErr: {e}")
+        try:
+            response_json = response.json()
+            file_decoded = base64.b64decode(response_json['content'])
+            with open(filename + '.zip', 'wb') as zip_file:
+                zip_file.write(file_decoded)
+            return zip_file.name
+        except (ValueError, KeyError, TypeError, OSError):
+            # ValueError: bad JSON or bad base64; OSError: the write failed.
+            return None
+
+class Command(BaseCommand):
+    help = 'Downloads a specified ROR data dump from Github'
+
+    def add_arguments(self, parser):
+        # handle() reads both options, so declare them for direct CLI use.
+        parser.add_argument('filename', type=str, help='Name of the data dump zip file, without extension')
+        parser.add_argument('--testdata', action='store_true', help='Use the test data repo instead of the production repo')
+
+    def handle(self, *args, **options):
+        filename = options['filename']
+        use_test_data = options['testdata']
+        self.stdout.write('Getting ROR dump')
+        if ROR_DUMP['GITHUB_TOKEN']:
+            github_headers = AUTH_HEADERS
+        else:
+            github_headers = HEADERS
+        ror_dump_zip = get_ror_dump_zip(filename, use_test_data, github_headers)
+        if ror_dump_zip:
+            self.stdout.write('ROR dump saved to ' + ror_dump_zip)
+        else:
+            self.stderr.write('ROR dump ' + filename + ' could not be downloaded')
diff --git a/rorapi/management/commands/indexrordump.py b/rorapi/management/commands/indexrordump.py
index 10b6704..f1ce85a 100644
--- a/rorapi/management/commands/indexrordump.py
+++ b/rorapi/management/commands/indexrordump.py
@@ -96,44 +96,6 @@ def index_dump(self, filename, index, dataset):
         ES7.indices.delete(backup_index)
     self.stdout.write('ROR dataset ' + filename + ' indexed')
 
-def get_ror_dump_sha(filename, use_test_data):
-    sha = ''
-    if use_test_data:
-        contents_url = ROR_DUMP['TEST_REPO_URL'] + '/contents'
-    else:
-        contents_url = ROR_DUMP['PROD_REPO_URL'] + '/contents'
-    try:
-        response = requests.get(contents_url, headers=HEADERS)
-    except requests.exceptions.RequestException as e:
-        raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}")
-    try:
-        repo_contents = response.json()
-        for file in repo_contents:
-            if filename in file['name']:
-                sha = file['sha']
-        return sha
-    except:
-        return None
-
-def get_ror_dump_zip(filename, use_test_data):
-    sha = get_ror_dump_sha(filename, use_test_data)
-    if sha:
-        if use_test_data:
-            blob_url = ROR_DUMP['TEST_REPO_URL'] + '/git/blobs/' + sha
-        else:
-            blob_url = ROR_DUMP['PROD_REPO_URL'] + '/git/blobs/' + sha
-        try:
-            response = requests.get(blob_url, headers=HEADERS)
-        except requests.exceptions.RequestException as e:
-            raise SystemExit(f"Github blob is Not reachable \nErr: {e}")
-        try:
-            response_json = response.json()
-            file_decoded = base64.b64decode(response_json['content'])
-            with open(filename + '.zip', 'wb') as zip_file:
-                zip_file.write(file_decoded)
-            return zip_file.name
-        except:
-            return None
 
 class Command(BaseCommand):
     help = 'Indexes ROR dataset from a full dump file in ror-data repo'
@@ -141,11 +103,11 @@ class Command(BaseCommand):
     def handle(self, *args, **options):
         json_files = []
         filename = options['filename']
-        use_test_data = options['testdata']
-        ror_dump_zip = get_ror_dump_zip(filename, use_test_data)
-        if ror_dump_zip:
+        ror_dump_zip = filename + '.zip'
+        if os.path.exists(ror_dump_zip):
             if not os.path.exists(DATA['WORKING_DIR']):
                 os.makedirs(DATA['WORKING_DIR'])
+            self.stdout.write('Extracting ROR dump')
             with zipfile.ZipFile(ror_dump_zip, 'r') as zip_ref:
                 zip_ref.extractall(DATA['WORKING_DIR'] + filename)
             unzipped_files = os.listdir(DATA['WORKING_DIR'] + filename)
@@ -155,13 +117,17 @@ def handle(self, *args, **options):
             for json_file in json_files:
                 index = None
                 json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file
-                with open(json_path, 'r') as it:
-                    dataset = json.load(it)
                 if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None):
+                    self.stdout.write('Loading JSON')
+                    with open(json_path, 'r') as it:
+                        dataset = json.load(it)
                     self.stdout.write('Indexing ROR dataset ' + json_file)
                     index = ES_VARS['INDEX_V2']
                     index_dump(self, json_file, index, dataset)
                 if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None):
+                    self.stdout.write('Loading JSON')
+                    with open(json_path, 'r') as it:
+                        dataset = json.load(it)
                     self.stdout.write('Indexing ROR dataset ' + json_file)
                     index = ES_VARS['INDEX_V1']
                     index_dump(self, json_file, index, dataset)
diff --git a/rorapi/management/commands/setup.py b/rorapi/management/commands/setup.py
index bf7e5fe..3505e72 100644
--- a/rorapi/management/commands/setup.py
+++ b/rorapi/management/commands/setup.py
@@ -1,14 +1,18 @@
 import requests
 import zipfile
-
+import base64
 from django.core.management.base import BaseCommand
 from rorapi.management.commands.deleteindex import Command as DeleteIndexCommand
 from rorapi.management.commands.createindex import Command as CreateIndexCommand
 from rorapi.management.commands.indexrordump import Command as IndexRorDumpCommand
+from rorapi.management.commands.getrordump import Command as GetRorDumpCommand
 from rorapi.settings import ROR_DUMP
 
 HEADERS = {'Accept': 'application/vnd.github.v3+json'}
 
+if ROR_DUMP['GITHUB_TOKEN']:
+    HEADERS = {'Authorization': 'token {}'.format(ROR_DUMP['GITHUB_TOKEN']), 'Accept': 'application/vnd.github.v3+json'}
+
 def get_ror_dump_sha(filename, use_test_data):
     sha = ''
     if use_test_data:
@@ -49,6 +53,7 @@ def handle(self, *args, **options):
         sha = get_ror_dump_sha(filename, use_test_data)
 
         if sha:
+            GetRorDumpCommand().handle(*args, **options)
             DeleteIndexCommand().handle(*args, **options)
             CreateIndexCommand().handle(*args, **options)
             IndexRorDumpCommand().handle(*args, **options)