Skip to content

Commit

Permalink
deposit: PDF metadata extraction
Browse files Browse the repository at this point in the history
* No longer extracts metadata on upload.
* Provides a REST endpoint for extracting metadata.

Co-Authored-by: Sébastien Délèze <sebastien.deleze@rero.ch>
  • Loading branch information
Sébastien Délèze committed Feb 21, 2020
1 parent de0aaf8 commit 822a521
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 11 deletions.
37 changes: 27 additions & 10 deletions sonar/modules/deposits/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,6 @@ def post(pid=None):
deposit.files[key]['category'] = request.args['type']
deposit.files[key]['file_type'] = 'file'

# Extract data from pdf and populate deposit
if request.args['type'] == 'main':
pdf_extractor = PDFExtractor()
pdf_metadata = format_extracted_data(
pdf_extractor.process_raw(request.get_data()))

# deposit.populate_with_pdf_metadata(
# pdf_metadata, "Deposit #{pid}".format(pid=pid))
deposit.files[key]['pdf_metadata'] = pdf_metadata

deposit.commit()
db.session.commit()

Expand Down Expand Up @@ -230,3 +220,30 @@ def review(pid=None):
db.session.commit()

return make_response(jsonify(deposit))


@blueprint.route('/<pid>/extract-pdf-metadata', methods=['GET'])
def extract_metadata(pid=None):
    """Extract metadata from the main PDF file of a deposit.

    :param pid: Identifier of the deposit, taken from the URL.
    :returns: JSON response built from the formatted extracted metadata.

    Aborts with 400 when no deposit matches ``pid`` and with 500 when the
    deposit has no file of category ``main`` with a PDF mimetype.
    """
    deposit = DepositRecord.get_record_by_pid(pid)

    if not deposit:
        abort(400)

    # Only the first matching file is used, so stop at the first hit.
    main_file = next(
        (file for file in deposit.files
         if file['category'] == 'main'
         and file.mimetype == 'application/pdf'),
        None)

    if main_file is None:
        abort(500)

    # Get file content
    with main_file.file.storage().open() as pdf_file:
        content = pdf_file.read()

    # Extract data from pdf
    pdf_extractor = PDFExtractor()
    pdf_metadata = format_extracted_data(pdf_extractor.process_raw(content))

    return make_response(jsonify(pdf_metadata))
26 changes: 26 additions & 0 deletions tests/api/deposits/test_deposits_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,29 @@ def test_review(client, db, db_user_fixture, db_moderator_fixture,
}),
headers=headers)
assert response.status_code == 200


def test_extract_metadata(app, client, deposit_fixture):
    """Check the PDF metadata extraction endpoint.

    Covers three cases in order: a successful extraction, a deposit whose
    main file has been removed, and a PID that does not exist.
    """
    endpoint = 'https://localhost:5000/deposits/{pid}/extract-pdf-metadata'
    deposit_url = endpoint.format(pid=deposit_fixture['pid'])
    json_headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }

    # Deposit with its main PDF: metadata is returned.
    result = client.get(deposit_url, headers=json_headers)
    assert result.status_code == 200
    expected_title = 'High-harmonic generation in quantum spin systems'
    assert result.json['title'] == expected_title

    # Same deposit once the main file is gone.
    deposit_fixture.files['main.pdf'].remove()
    result = client.get(deposit_url, headers=json_headers)
    assert result.status_code == 500

    # Unknown deposit PID.
    missing_url = endpoint.format(pid='not-existing')
    result = client.get(missing_url, headers=json_headers)
    assert result.status_code == 400
3 changes: 2 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,8 +444,9 @@ def deposit_fixture(app, db, db_user_fixture, pdf_file,
deposit.files['additional.pdf']['category'] = 'additional'
deposit.files['additional.pdf']['file_type'] = 'file'

db.session.commit()
deposit.commit()
deposit.reindex()
db.session.commit()

current_search.flush_and_refresh('deposits')

Expand Down

0 comments on commit 822a521

Please sign in to comment.