Skip to content

Commit

Permalink
Merge pull request #82 from golnazads/master
Browse files Browse the repository at this point in the history
fixed a bug, eprint-pub bibcodes are switched if pub is being matched…
  • Loading branch information
golnazads authored Dec 30, 2024
2 parents b875e36 + 6cdb721 commit 281c490
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 56 deletions.
4 changes: 4 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -832,3 +832,7 @@
ORACLE_MAX_RECORDS_ADD = 100
# number of records that can be deleted in one call
ORACLE_MAX_RECORDS_DEL = 100


ORACLE_DOCTYPE_EPRINT = 'eprint'
ORACLE_DOCTYPE_PUB = 'article'
19 changes: 11 additions & 8 deletions oraclesrv/doc_matching.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from flask import current_app

from oraclesrv.utils import get_solr_data_match, get_solr_data_match_doi, get_solr_data_match_doctype_case, get_solr_data_match_pubnote
from oraclesrv.utils import get_solr_data_match, get_solr_data_match_doi, get_solr_data_match_doctype_case, \
get_solr_data_match_pubnote, add_a_record, is_eprint_bibcode
from oraclesrv.score import clean_metadata, get_matches, encode_author, format_author, get_doi_match, get_db_match
from oraclesrv.utils import add_a_record

def get_requests_params(payload, param, default_value=None, default_type=str):
"""
Expand Down Expand Up @@ -43,6 +43,9 @@ def __init__(self, payload, save=True):
self.save_to_db = save
self.extra_filter = get_requests_params(payload, 'extra_filter')

if not self.doctype:
self.doctype = current_app.config['ORACLE_DOCTYPE_EPRINT'] if is_eprint_bibcode(self.source_bibcode) else current_app.config['ORACLE_DOCTYPE_PUB']

def create_and_return_response(self, match, query, comment=None):
"""
Expand Down Expand Up @@ -70,7 +73,7 @@ def query_doctype(self, comment):
results, query, solr_status_code = get_solr_data_match_doctype_case(self.author, self.year, self.doctype, '"%s"' % '" OR "'.join(self.match_doctype))
# if any records from solr
if isinstance(results, list) and len(results) > 0:
match = get_matches(self.source_bibcode, self.abstract, self.title, self.author, self.year, None, results)
match = get_matches(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, None, results)
if not match:
current_app.logger.debug('No result from solr for %s.'%doctype)
comment += ' No result from solr for %s.'%doctype
Expand All @@ -92,7 +95,7 @@ def query_doi(self, comment):
# if any records from solr
# compute the score, if score is 0 doi was wrong, so continue on to query using similar
if isinstance(results, list) and len(results) > 0:
match = get_doi_match(self.source_bibcode, self.abstract, self.title, self.author, self.year, self.doi, results)
match = get_doi_match(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, self.doi, results)
if match:
return self.create_and_return_response(match, query), ''
else:
Expand All @@ -117,7 +120,7 @@ def query_pubnote(self, comment):
# if any records from solr
# compute the score, if score is 0 doi was wrong, so continue on to query using similar
if isinstance(results, list) and len(results) > 0:
match = get_doi_match(self.source_bibcode, self.abstract, self.title, self.author, self.year, self.doi, results)
match = get_doi_match(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, self.doi, results)
if match:
return self.create_and_return_response(match, query), ''
else:
Expand Down Expand Up @@ -150,7 +153,7 @@ def query_abstract_or_title(self, comment):
return self.create_and_return_response([], query, 'status code: %d' % solr_status_code)
# got records from solr, see if we can get a match
else:
match = get_matches(self.source_bibcode, self.abstract, self.title, self.author, self.year, self.doi, results)
match = get_matches(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, self.doi, results)
if len(match) > 0:
return self.create_and_return_response(match, query, comment)
# otherwise if no match with abstract, and we think we should have this in solr
Expand Down Expand Up @@ -178,7 +181,7 @@ def query_abstract_or_title(self, comment):
return self.create_and_return_response(match='', query=query, comment=comment)

# got results with title, see if it can be matched
match = get_matches(self.source_bibcode, self.abstract, self.title, self.author, self.year, None, results)
match = get_matches(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, None, results)
return self.create_and_return_response(match, query, comment)

def save_match(self, result):
Expand Down Expand Up @@ -247,7 +250,7 @@ def process(self):
self.match_doctype = ' OR '.join(self.match_doctype)

# if doi is available from the eprint try query on doi first
if self.doi and self.doctype == 'eprint':
if self.doi and self.doctype == current_app.config['ORACLE_DOCTYPE_EPRINT']:
result, comment = self.query_doi(comment)
if result:
self.save_match(result)
Expand Down
6 changes: 4 additions & 2 deletions oraclesrv/models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@

import re

from flask import current_app

from sqlalchemy import Float, String, Column, DateTime, func
from sqlalchemy.ext.declarative import declarative_base

Expand Down Expand Up @@ -56,10 +58,10 @@ def set_eprint_bibcode(self, source_bibcode, matched_bibcode, source_bibcode_doc
:return:
"""
if source_bibcode_doctype:
if source_bibcode_doctype == 'eprint':
if source_bibcode_doctype == current_app.config['ORACLE_DOCTYPE_EPRINT']:
self.eprint_bibcode = source_bibcode
return self.eprint_bibcode
if source_bibcode_doctype == 'article':
if source_bibcode_doctype == current_app.config['ORACLE_DOCTYPE_PUB']:
self.eprint_bibcode = matched_bibcode
return self.eprint_bibcode

Expand Down
22 changes: 15 additions & 7 deletions oraclesrv/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from oraclesrv.utils import get_a_record, get_a_matched_record
from oraclesrv.keras_model import KerasModel
from oraclesrv.models import DocMatch

confidence_model = KerasModel()

Expand Down Expand Up @@ -113,11 +114,11 @@ def get_refereed_score(is_refereed):
return current_app.config['ORACLE_SERVICE_REFEREED_SCORE']
return current_app.config['ORACLE_SERVICE_NOT_REFEREED_SCORE']

re_match_arXiv = re.compile(r'(\d\d\d\darXiv.*)')
def get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs):
def get_matches(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs):
"""
:param source_bibcode:
:param doctype:
:param abstract:
:param title:
:param author:
Expand Down Expand Up @@ -177,7 +178,7 @@ def get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs
# if we are matching with eprints, consider eprint a refereed manuscript
# else check the flag for refereed in the property field
# if not refereed we want to penalize the confidence score
match_refereed = True if 'eprint' in doc.get('doctype') else (True if 'REFEREED' in doc.get('property', []) else False)
match_refereed = True if current_app.config['ORACLE_DOCTYPE_EPRINT'] in doc.get('doctype') else (True if 'REFEREED' in doc.get('property', []) else False)
confidence = float(confidence_format % (confidence_model.predict(scores) * get_refereed_score(match_refereed)))

# see if either of these bibcodes have already been matched
Expand Down Expand Up @@ -209,9 +210,15 @@ def get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs
elif (source_bibcode in prev_bibcodes or match_bibcode in prev_bibcodes) and prev_confidence > confidence:
confidence = prev_confidence
scores = []
# either or both have been matched, so use the previous match
# find out if source bibcode is an eprint to assign it correctly
if not (source_bibcode in prev_bibcodes and match_bibcode in prev_bibcodes):
source_bibcode = prev_match['eprint_bibcode']
match_bibcode = prev_match['pub_bibcode']
if doctype == current_app.config['ORACLE_DOCTYPE_EPRINT']:
source_bibcode = prev_match['eprint_bibcode']
match_bibcode = prev_match['pub_bibcode']
elif doctype == current_app.config['ORACLE_DOCTYPE_PUB']:
source_bibcode = prev_match['pub_bibcode']
match_bibcode = prev_match['eprint_bibcode']

result = {'source_bibcode': source_bibcode, 'matched_bibcode': match_bibcode,
'confidence': confidence, 'matched': int(confidence > 0.5),
Expand All @@ -237,10 +244,11 @@ def get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs

return []

def get_doi_match(source_bibcode, abstract, title, author, year, doi, matched_docs):
def get_doi_match(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs):
"""
:param source_bibcode:
:param doctype:
:param abstract:
:param title:
:param author:
Expand All @@ -249,7 +257,7 @@ def get_doi_match(source_bibcode, abstract, title, author, year, doi, matched_do
:param matched_docs:
:return:
"""
results = get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs)
results = get_matches(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs)
if len(results) == 1:
return results
return []
Expand Down
34 changes: 10 additions & 24 deletions oraclesrv/tests/unittests/test_oracle_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from oraclesrv.tests.unittests.base import TestCaseDatabase
from oraclesrv.utils import get_a_record, del_records, add_a_record, query_docmatch, query_source_score, lookup_confidence, \
get_a_matched_record, query_docmatch, query_source_score, lookup_confidence, delete_tmp_matches, replace_tmp_with_canonical, \
delete_multi_matches, clean_db, get_tmp_bibcodes, get_muti_matches, add_records, get_solr_data_chunk
delete_multi_matches, clean_db, get_tmp_bibcodes, get_muti_matches, add_records, get_solr_data_chunk, is_eprint_bibcode
from oraclesrv.score import get_matches, get_doi_match
from oraclesrv.models import DocMatch, ConfidenceLookup, EPrintBibstemLookup

Expand Down Expand Up @@ -172,6 +172,7 @@ def test_docmatch(self):
title = 'Nonlinear corrections in the quantization of a weakly nonideal Bose gas at zero temperature. II. The general case'
author = 'Smolyakov, Mikhail N.'
year = 2022
doctype = 'eprint'
matched_docs = [{'bibcode': '2021CSF...15311505S',
'abstract': 'In the present paper, quantization of a weakly nonideal Bose gas at zero temperature along the lines of the well-known Bogolyubov approach is performed. The analysis presented in this paper is based, in addition to the steps of the original Bogolyubov approach, on the use of nonoscillation modes (which are also solutions of the linearized Heisenberg equation) for recovering the canonical commutation relations in the linear approximation, as well as on the calculation of the first nonlinear correction to the solution of the linearized Heisenberg equation which satisfies the canonical commutation relations at the next order. It is shown that, at least in the case of free quasi-particles, consideration of the nonlinear correction automatically solves the problem of nonconserved particle number, which is inherent to the original approach.',
'author_norm': ['Smolyakov, M'],
Expand Down Expand Up @@ -231,7 +232,7 @@ def test_docmatch(self):
'confidence': 0.7142998,
'matched': 1,
'scores': {'abstract': 0.76, 'title': 0.98, 'author': 1, 'year': 1}}
matches = get_matches(source_bibcode, abstract, title, author, year, None, matched_docs)
matches = get_matches(source_bibcode, doctype, abstract, title, author, year, None, matched_docs)
self.assertEqual(len(matches), 1)
self.assertDictEqual(matches[0], best_match)

Expand All @@ -248,7 +249,7 @@ def test_docmatch(self):
'confidence': 0.9829099,
'matched': 1,
'scores': {}}
matches = get_matches(source_bibcode, abstract, title, author, year, None, matched_docs)
matches = get_matches(source_bibcode, doctype, abstract, title, author, year, None, matched_docs)
self.assertEqual(len(matches), 1)
self.assertDictEqual(matches[0], best_match)

Expand Down Expand Up @@ -283,7 +284,7 @@ def test_docmatch_changed_bibcode(self):
'title':['Numerical investigation of non-Gaussianities in the phase and modulus of density Fourier modes'],
'year':'2022'}]
# match it
matches = get_doi_match(source_doc['bibcode'], source_doc['abstract'], source_doc['title'],
matches = get_doi_match(source_doc['bibcode'], source_doc['doctype'], source_doc['abstract'], source_doc['title'],
source_doc['author'], source_doc['year'], source_doc['doi'], matched_docs)
# current match the same as prev with the new bibcode
current_match = {'source_bibcode': '2021arXiv210911714Q',
Expand Down Expand Up @@ -1138,29 +1139,14 @@ def test_get_solr_data_chunk(self):
self.assertIsNone(result)
self.assertIsInstance(status_code, requests.exceptions.RequestException)

def add_docmatch_data(self):
def test_is_eprint_bibcode(self):
"""
Add docmatch data
Test is_eprint_bibcode function
"""
self.add_eprint_bibstem_lookup_data()

docmatch_data = [
('2021arXiv210312030S', '2021CSF...15311505S', 0.9829099),
('2017arXiv171111082H', '2018ConPh..59...16H', 0.9877064),
('2018arXiv181105526S', '2022NuPhB.98015830S', 0.97300124),
]

docmatch_records = []
for record in docmatch_data:
docmatch_record = {'source_bibcode': record[0],
'matched_bibcode': record[1],
'confidence': record[2]}
docmatch_records.append(docmatch_record)
self.add_docmatch_data()

headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
response = self.client.put('/add', data=json.dumps(docmatch_records), headers=headers)
self.assertEqual(response._status_code, 200)
self.assertEqual(response.json['status'], 'updated db with new data successfully')
self.assertTrue(is_eprint_bibcode('2021arXiv210312030S'))
self.assertFalse(is_eprint_bibcode('2021CSF...15311505S'))


if __name__ == "__main__":
Expand Down
Loading

0 comments on commit 281c490

Please sign in to comment.