From 41046939eeec99f10bbb3c3825c6e9679db85971 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Thu, 18 Apr 2024 11:35:55 -0500 Subject: [PATCH 1/2] parse JSON separately when indexing full dump --- rorapi/management/commands/indexrordump.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/rorapi/management/commands/indexrordump.py b/rorapi/management/commands/indexrordump.py index 10b6704..5c43c20 100644 --- a/rorapi/management/commands/indexrordump.py +++ b/rorapi/management/commands/indexrordump.py @@ -142,26 +142,33 @@ def handle(self, *args, **options): json_files = [] filename = options['filename'] use_test_data = options['testdata'] + self.stdout.write('Getting ROR dump') ror_dump_zip = get_ror_dump_zip(filename, use_test_data) if ror_dump_zip: if not os.path.exists(DATA['WORKING_DIR']): os.makedirs(DATA['WORKING_DIR']) + self.stdout.write('Extracting ROR dump') with zipfile.ZipFile(ror_dump_zip, 'r') as zip_ref: zip_ref.extractall(DATA['WORKING_DIR'] + filename) unzipped_files = os.listdir(DATA['WORKING_DIR'] + filename) for file in unzipped_files: if file.endswith(".json"): json_files.append(file) + for json_file in json_files: index = None json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file - with open(json_path, 'r') as it: - dataset = json.load(it) if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None): + self.stdout.write('Loading JSON') + with open(json_path, 'r') as it: + dataset = json.load(it) self.stdout.write('Indexing ROR dataset ' + json_file) index = ES_VARS['INDEX_V2'] index_dump(self, json_file, index, dataset) if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None): + self.stdout.write('Loading JSON') + with open(json_path, 'r') as it: + dataset = json.load(it) self.stdout.write('Indexing ROR dataset ' + json_file) index = ES_VARS['INDEX_V1'] index_dump(self, json_file, index, dataset) From fb9211d79f598a75ed821158e1ba0d7dfe0f4a56 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Thu, 18 Apr 2024 14:36:19 -0500 Subject: [PATCH 2/2] add new command for fetching dump file --- rorapi/management/commands/getrordump.py | 64 ++++++++++++++++++++++ rorapi/management/commands/indexrordump.py | 45 +-------------- rorapi/management/commands/setup.py | 6 +- 3 files changed, 71 insertions(+), 44 deletions(-) create mode 100644 rorapi/management/commands/getrordump.py diff --git a/rorapi/management/commands/getrordump.py b/rorapi/management/commands/getrordump.py new file mode 100644 index 0000000..4bee42d --- /dev/null +++ b/rorapi/management/commands/getrordump.py @@ -0,0 +1,64 @@ +import json +import os +import re +import requests +import zipfile +import base64 +from io import BytesIO +from rorapi.settings import ES7, ES_VARS, ROR_DUMP, DATA +from django.core.management.base import BaseCommand + +HEADERS = {'Accept': 'application/vnd.github.v3+json'} +AUTH_HEADERS = {'Authorization': 'token {}'.format(ROR_DUMP['GITHUB_TOKEN']), 'Accept': 'application/vnd.github.v3+json'} + +def get_ror_dump_sha(filename, use_test_data, github_headers): + sha = '' + if use_test_data: + contents_url = ROR_DUMP['TEST_REPO_URL'] + '/contents' + else: + contents_url = ROR_DUMP['PROD_REPO_URL'] + '/contents' + try: + response = requests.get(contents_url, headers=github_headers) + except requests.exceptions.RequestException as e: + raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}") + try: + repo_contents = response.json() + for file in repo_contents: + if filename in file['name']: + sha = file['sha'] + return sha + except: + return None + +def get_ror_dump_zip(filename, use_test_data, github_headers): + sha = get_ror_dump_sha(filename, use_test_data, github_headers) + if sha: + if use_test_data: + blob_url = ROR_DUMP['TEST_REPO_URL'] + '/git/blobs/' + sha + else: + blob_url = ROR_DUMP['PROD_REPO_URL'] + '/git/blobs/' + sha + try: + response = requests.get(blob_url, headers=github_headers) + except requests.exceptions.RequestException as e: + raise SystemExit(f"Github blob is Not reachable \nErr: {e}") + try: + response_json = response.json() + file_decoded = base64.b64decode(response_json['content']) + with open(filename + '.zip', 'wb') as zip_file: + zip_file.write(file_decoded) + return zip_file.name + except: + return None + +class Command(BaseCommand): + help = 'Downloads a specified ROR data dump from Github' + + def handle(self, *args, **options): + filename = options['filename'] + use_test_data = options['testdata'] + self.stdout.write('Getting ROR dump') + if ROR_DUMP['GITHUB_TOKEN']: + github_headers = AUTH_HEADERS + else: + github_headers = HEADERS + ror_dump_zip = get_ror_dump_zip(filename, use_test_data, github_headers) diff --git a/rorapi/management/commands/indexrordump.py b/rorapi/management/commands/indexrordump.py index 5c43c20..f1ce85a 100644 --- a/rorapi/management/commands/indexrordump.py +++ b/rorapi/management/commands/indexrordump.py @@ -96,44 +96,6 @@ def index_dump(self, filename, index, dataset): ES7.indices.delete(backup_index) self.stdout.write('ROR dataset ' + filename + ' indexed') -def get_ror_dump_sha(filename, use_test_data): - sha = '' - if use_test_data: - contents_url = ROR_DUMP['TEST_REPO_URL'] + '/contents' - else: - contents_url = ROR_DUMP['PROD_REPO_URL'] + '/contents' - try: - response = requests.get(contents_url, headers=HEADERS) - except requests.exceptions.RequestException as e: - raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}") - try: - repo_contents = response.json() - for file in repo_contents: - if filename in file['name']: - sha = file['sha'] - return sha - except: - return None - -def get_ror_dump_zip(filename, use_test_data): - sha = get_ror_dump_sha(filename, use_test_data) - if sha: - if use_test_data: - blob_url = ROR_DUMP['TEST_REPO_URL'] + '/git/blobs/' + sha - else: - blob_url = ROR_DUMP['PROD_REPO_URL'] + '/git/blobs/' + sha - try: - response = requests.get(blob_url, headers=HEADERS) - except requests.exceptions.RequestException as e: - raise SystemExit(f"Github blob is Not reachable \nErr: {e}") - try: - response_json = response.json() - file_decoded = base64.b64decode(response_json['content']) - with open(filename + '.zip', 'wb') as zip_file: - zip_file.write(file_decoded) - return zip_file.name - except: - return None class Command(BaseCommand): help = 'Indexes ROR dataset from a full dump file in ror-data repo' @@ -141,10 +103,8 @@ class Command(BaseCommand): def handle(self, *args, **options): json_files = [] filename = options['filename'] - use_test_data = options['testdata'] - self.stdout.write('Getting ROR dump') - ror_dump_zip = get_ror_dump_zip(filename, use_test_data) - if ror_dump_zip: + ror_dump_zip = filename + '.zip' + if os.path.exists(ror_dump_zip): if not os.path.exists(DATA['WORKING_DIR']): os.makedirs(DATA['WORKING_DIR']) self.stdout.write('Extracting ROR dump') @@ -154,7 +114,6 @@ def handle(self, *args, **options): for file in unzipped_files: if file.endswith(".json"): json_files.append(file) - for json_file in json_files: index = None json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file diff --git a/rorapi/management/commands/setup.py b/rorapi/management/commands/setup.py index bf7e5fe..3505e72 100644 --- a/rorapi/management/commands/setup.py +++ b/rorapi/management/commands/setup.py @@ -1,14 +1,17 @@ import requests import zipfile - +import base64 from django.core.management.base import BaseCommand from rorapi.management.commands.deleteindex import Command as DeleteIndexCommand from rorapi.management.commands.createindex import Command as CreateIndexCommand from rorapi.management.commands.indexrordump import Command as IndexRorDumpCommand +from rorapi.management.commands.getrordump import Command as GetRorDumpCommand from rorapi.settings import ROR_DUMP HEADERS = {'Accept': 'application/vnd.github.v3+json'} +HEADERS = {'Authorization': 'token {}'.format(ROR_DUMP['GITHUB_TOKEN']), 'Accept': 'application/vnd.github.v3+json'} + def get_ror_dump_sha(filename, use_test_data): sha = '' if use_test_data: @@ -49,6 +52,7 @@ def handle(self, *args, **options): sha = get_ror_dump_sha(filename, use_test_data) if sha: + GetRorDumpCommand().handle(*args, **options) DeleteIndexCommand().handle(*args, **options) CreateIndexCommand().handle(*args, **options) IndexRorDumpCommand().handle(*args, **options)