Merge pull request #82 from golnazads/master

Fixed a bug: eprint-pub bibcodes are switched if pub is being matched…
adsabs · Dec 30, 2024 · 281c490 · 281c490
2 parents b875e36 + 6cdb721
commit 281c490
Show file tree

Hide file tree

Showing 7 changed files with 132 additions and 56 deletions.
diff --git a/config.py b/config.py
@@ -832,3 +832,7 @@
 ORACLE_MAX_RECORDS_ADD = 100
 # number of records that can be deleted in one call
 ORACLE_MAX_RECORDS_DEL = 100
+
+
+ORACLE_DOCTYPE_EPRINT = 'eprint'
+ORACLE_DOCTYPE_PUB = 'article'
diff --git a/oraclesrv/doc_matching.py b/oraclesrv/doc_matching.py
@@ -1,8 +1,8 @@
 from flask import current_app
 
-from oraclesrv.utils import get_solr_data_match, get_solr_data_match_doi, get_solr_data_match_doctype_case, get_solr_data_match_pubnote
+from oraclesrv.utils import get_solr_data_match, get_solr_data_match_doi, get_solr_data_match_doctype_case, \
+    get_solr_data_match_pubnote, add_a_record, is_eprint_bibcode
 from oraclesrv.score import clean_metadata, get_matches, encode_author, format_author, get_doi_match, get_db_match
-from oraclesrv.utils import add_a_record
 
 def get_requests_params(payload, param, default_value=None, default_type=str):
     """
@@ -43,6 +43,9 @@ def __init__(self, payload, save=True):
         self.save_to_db = save
         self.extra_filter = get_requests_params(payload, 'extra_filter')
 
+        if not self.doctype:
+            self.doctype = current_app.config['ORACLE_DOCTYPE_EPRINT'] if is_eprint_bibcode(self.source_bibcode) else current_app.config['ORACLE_DOCTYPE_PUB']
+
     def create_and_return_response(self, match, query, comment=None):
         """
 
@@ -70,7 +73,7 @@ def query_doctype(self, comment):
         results, query, solr_status_code = get_solr_data_match_doctype_case(self.author, self.year, self.doctype, '"%s"' % '" OR "'.join(self.match_doctype))
         # if any records from solr
         if isinstance(results, list) and len(results) > 0:
-            match = get_matches(self.source_bibcode, self.abstract, self.title, self.author, self.year, None, results)
+            match = get_matches(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, None, results)
             if not match:
                 current_app.logger.debug('No result from solr for %s.'%doctype)
                 comment += ' No result from solr for %s.'%doctype
@@ -92,7 +95,7 @@ def query_doi(self, comment):
         # if any records from solr
         # compute the score, if score is 0 doi was wrong, so continue on to query using similar
         if isinstance(results, list) and len(results) > 0:
-            match = get_doi_match(self.source_bibcode, self.abstract, self.title, self.author, self.year, self.doi, results)
+            match = get_doi_match(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, self.doi, results)
             if match:
                 return self.create_and_return_response(match, query), ''
             else:
@@ -117,7 +120,7 @@ def query_pubnote(self, comment):
         # if any records from solr
         # compute the score, if score is 0 doi was wrong, so continue on to query using similar
         if isinstance(results, list) and len(results) > 0:
-            match = get_doi_match(self.source_bibcode, self.abstract, self.title, self.author, self.year, self.doi, results)
+            match = get_doi_match(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, self.doi, results)
             if match:
                 return self.create_and_return_response(match, query), ''
             else:
@@ -150,7 +153,7 @@ def query_abstract_or_title(self, comment):
                 return self.create_and_return_response([], query, 'status code: %d' % solr_status_code)
         # got records from solr, see if we can get a match
         else:
-            match = get_matches(self.source_bibcode, self.abstract, self.title, self.author, self.year, self.doi, results)
+            match = get_matches(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, self.doi, results)
             if len(match) > 0:
                 return self.create_and_return_response(match, query, comment)
             # otherwise if no match with abstract, and we think we should have this in solr
@@ -178,7 +181,7 @@ def query_abstract_or_title(self, comment):
             return self.create_and_return_response(match='', query=query, comment=comment)
 
         # got results with title, see if it can be matched
-        match = get_matches(self.source_bibcode, self.abstract, self.title, self.author, self.year, None, results)
+        match = get_matches(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, None, results)
         return self.create_and_return_response(match, query, comment)
 
     def save_match(self, result):
@@ -247,7 +250,7 @@ def process(self):
         self.match_doctype = ' OR '.join(self.match_doctype)
 
         # if doi is available from the eprint try query on doi first
-        if self.doi and self.doctype == 'eprint':
+        if self.doi and self.doctype == current_app.config['ORACLE_DOCTYPE_EPRINT']:
             result, comment = self.query_doi(comment)
             if result:
                 self.save_match(result)

diff --git a/oraclesrv/models.py b/oraclesrv/models.py
@@ -1,6 +1,8 @@
 
 import re
 
+from flask import current_app
+
 from sqlalchemy import Float, String, Column, DateTime, func
 from sqlalchemy.ext.declarative import declarative_base
 
@@ -56,10 +58,10 @@ def set_eprint_bibcode(self, source_bibcode, matched_bibcode, source_bibcode_doc
         :return:
         """
         if source_bibcode_doctype:
-            if source_bibcode_doctype == 'eprint':
+            if source_bibcode_doctype == current_app.config['ORACLE_DOCTYPE_EPRINT']:
                 self.eprint_bibcode = source_bibcode
                 return self.eprint_bibcode
-            if source_bibcode_doctype == 'article':
+            if source_bibcode_doctype == current_app.config['ORACLE_DOCTYPE_PUB']:
                 self.eprint_bibcode = matched_bibcode
                 return self.eprint_bibcode
 

diff --git a/oraclesrv/score.py b/oraclesrv/score.py
@@ -15,6 +15,7 @@
 
 from oraclesrv.utils import get_a_record, get_a_matched_record
 from oraclesrv.keras_model import KerasModel
+from oraclesrv.models import DocMatch
 
 confidence_model = KerasModel()
 
@@ -113,11 +114,11 @@ def get_refereed_score(is_refereed):
         return current_app.config['ORACLE_SERVICE_REFEREED_SCORE']
     return current_app.config['ORACLE_SERVICE_NOT_REFEREED_SCORE']
 
-re_match_arXiv = re.compile(r'(\d\d\d\darXiv.*)')
-def get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs):
+def get_matches(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs):
     """
 
     :param source_bibcode:
+    :param doctype:
     :param abstract:
     :param title:
     :param author:
@@ -177,7 +178,7 @@ def get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs
         # if we are matching with eprints, consider eprint a refereed manuscript
         # else check the flag for refereed in the property field
         # if not refereed we want to penalize the confidence score
-        match_refereed = True if 'eprint' in doc.get('doctype') else (True if 'REFEREED' in doc.get('property', []) else False)
+        match_refereed = True if current_app.config['ORACLE_DOCTYPE_EPRINT'] in doc.get('doctype') else (True if 'REFEREED' in doc.get('property', []) else False)
         confidence = float(confidence_format % (confidence_model.predict(scores) * get_refereed_score(match_refereed)))
 
         # see if either of these bibcodes have already been matched
@@ -209,9 +210,15 @@ def get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs
             elif (source_bibcode in prev_bibcodes or match_bibcode in prev_bibcodes) and prev_confidence > confidence:
                 confidence = prev_confidence
                 scores = []
+                # either or both have been matched, so use the previous match
+                # find out if source bibcode is an eprint to assign it correctly
                 if not (source_bibcode in prev_bibcodes and match_bibcode in prev_bibcodes):
-                    source_bibcode = prev_match['eprint_bibcode']
-                    match_bibcode = prev_match['pub_bibcode']
+                    if doctype == current_app.config['ORACLE_DOCTYPE_EPRINT']:
+                        source_bibcode = prev_match['eprint_bibcode']
+                        match_bibcode = prev_match['pub_bibcode']
+                    elif doctype == current_app.config['ORACLE_DOCTYPE_PUB']:
+                        source_bibcode = prev_match['pub_bibcode']
+                        match_bibcode = prev_match['eprint_bibcode']
 
         result = {'source_bibcode': source_bibcode, 'matched_bibcode': match_bibcode,
                   'confidence': confidence, 'matched': int(confidence > 0.5),
@@ -237,10 +244,11 @@ def get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs
 
     return []
 
-def get_doi_match(source_bibcode, abstract, title, author, year, doi, matched_docs):
+def get_doi_match(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs):
     """
 
     :param source_bibcode:
+    :param doctype:
     :param abstract:
     :param title:
     :param author:
@@ -249,7 +257,7 @@ def get_doi_match(source_bibcode, abstract, title, author, year, doi, matched_do
     :param matched_docs:
     :return:
     """
-    results = get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs)
+    results = get_matches(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs)
     if len(results) == 1:
         return results
     return []

diff --git a/oraclesrv/tests/unittests/test_oracle_db.py b/oraclesrv/tests/unittests/test_oracle_db.py
@@ -15,7 +15,7 @@
 from oraclesrv.tests.unittests.base import TestCaseDatabase
 from oraclesrv.utils import get_a_record, del_records, add_a_record, query_docmatch, query_source_score, lookup_confidence, \
     get_a_matched_record, query_docmatch, query_source_score, lookup_confidence, delete_tmp_matches, replace_tmp_with_canonical, \
-    delete_multi_matches, clean_db, get_tmp_bibcodes, get_muti_matches, add_records, get_solr_data_chunk
+    delete_multi_matches, clean_db, get_tmp_bibcodes, get_muti_matches, add_records, get_solr_data_chunk, is_eprint_bibcode
 from oraclesrv.score import get_matches, get_doi_match
 from oraclesrv.models import DocMatch, ConfidenceLookup, EPrintBibstemLookup
 
@@ -172,6 +172,7 @@ def test_docmatch(self):
         title = 'Nonlinear corrections in the quantization of a weakly nonideal Bose gas   at zero temperature. II. The general case'
         author = 'Smolyakov, Mikhail N.'
         year = 2022
+        doctype = 'eprint'
         matched_docs = [{'bibcode': '2021CSF...15311505S',
                          'abstract': 'In the present paper, quantization of a weakly nonideal Bose gas at zero temperature along the lines of the well-known Bogolyubov approach is performed. The analysis presented in this paper is based, in addition to the steps of the original Bogolyubov approach, on the use of nonoscillation modes (which are also solutions of the linearized Heisenberg equation) for recovering the canonical commutation relations in the linear approximation, as well as on the calculation of the first nonlinear correction to the solution of the linearized Heisenberg equation which satisfies the canonical commutation relations at the next order. It is shown that, at least in the case of free quasi-particles, consideration of the nonlinear correction automatically solves the problem of nonconserved particle number, which is inherent to the original approach.',
                          'author_norm': ['Smolyakov, M'],
@@ -231,7 +232,7 @@ def test_docmatch(self):
                       'confidence': 0.7142998,
                       'matched': 1,
                       'scores': {'abstract': 0.76, 'title': 0.98, 'author': 1, 'year': 1}}
-        matches = get_matches(source_bibcode, abstract, title, author, year, None, matched_docs)
+        matches = get_matches(source_bibcode, doctype, abstract, title, author, year, None, matched_docs)
         self.assertEqual(len(matches), 1)
         self.assertDictEqual(matches[0], best_match)
 
@@ -248,7 +249,7 @@ def test_docmatch(self):
                       'confidence': 0.9829099,
                       'matched': 1,
                       'scores': {}}
-        matches = get_matches(source_bibcode, abstract, title, author, year, None, matched_docs)
+        matches = get_matches(source_bibcode, doctype, abstract, title, author, year, None, matched_docs)
         self.assertEqual(len(matches), 1)
         self.assertDictEqual(matches[0], best_match)
 
@@ -283,7 +284,7 @@ def test_docmatch_changed_bibcode(self):
                          'title':['Numerical investigation of non-Gaussianities in the phase and modulus of density Fourier modes'],
                          'year':'2022'}]
         # match it
-        matches = get_doi_match(source_doc['bibcode'], source_doc['abstract'], source_doc['title'],
+        matches = get_doi_match(source_doc['bibcode'], source_doc['doctype'], source_doc['abstract'], source_doc['title'],
                               source_doc['author'], source_doc['year'], source_doc['doi'], matched_docs)
         # current match the same as prev with the new bibcode
         current_match = {'source_bibcode': '2021arXiv210911714Q',
@@ -1138,29 +1139,14 @@ def test_get_solr_data_chunk(self):
             self.assertIsNone(result)
             self.assertIsInstance(status_code, requests.exceptions.RequestException)
 
-    def add_docmatch_data(self):
+    def test_is_eprint_bibcode(self):
         """
-        Add docmatch data
+        Test is_eprint_bibcode function
         """
-        self.add_eprint_bibstem_lookup_data()
-
-        docmatch_data = [
-                        ('2021arXiv210312030S', '2021CSF...15311505S', 0.9829099),
-                        ('2017arXiv171111082H', '2018ConPh..59...16H', 0.9877064),
-                        ('2018arXiv181105526S', '2022NuPhB.98015830S', 0.97300124),
-        ]
-
-        docmatch_records = []
-        for record in docmatch_data:
-            docmatch_record = {'source_bibcode': record[0],
-                               'matched_bibcode': record[1],
-                               'confidence': record[2]}
-            docmatch_records.append(docmatch_record)
+        self.add_docmatch_data()
 
-        headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
-        response = self.client.put('/add', data=json.dumps(docmatch_records), headers=headers)
-        self.assertEqual(response._status_code, 200)
-        self.assertEqual(response.json['status'], 'updated db with new data successfully')
+        self.assertTrue(is_eprint_bibcode('2021arXiv210312030S'))
+        self.assertFalse(is_eprint_bibcode('2021CSF...15311505S'))
 
 
 if __name__ == "__main__":