From 6cdb721cd012e71b54c70c409c769d0218a62eb1 Mon Sep 17 00:00:00 2001
From: golnazads <28757512+golnazads@users.noreply.github.com>
Date: Mon, 30 Dec 2024 09:48:16 -0500
Subject: [PATCH] fixed a bug, eprint-pub bibcodes are switched if pub is being
 matched and the match is in db

---
 config.py                                     |  4 +
 oraclesrv/doc_matching.py                     | 19 ++--
 oraclesrv/models.py                           |  6 +-
 oraclesrv/score.py                            | 22 +++--
 oraclesrv/tests/unittests/test_oracle_db.py   | 34 +++----
 .../tests/unittests/test_oracle_service.py    | 88 +++++++++++++++----
 oraclesrv/utils.py                            | 15 ++++
 7 files changed, 132 insertions(+), 56 deletions(-)

diff --git a/config.py b/config.py
index 8a7b29a..4468771 100644
--- a/config.py
+++ b/config.py
@@ -832,3 +832,7 @@
 ORACLE_MAX_RECORDS_ADD = 100
 # number of records that can be deleted in one call
 ORACLE_MAX_RECORDS_DEL = 100
+
+
+ORACLE_DOCTYPE_EPRINT = 'eprint'
+ORACLE_DOCTYPE_PUB = 'article'
diff --git a/oraclesrv/doc_matching.py b/oraclesrv/doc_matching.py
index 9605977..be4e9f7 100644
--- a/oraclesrv/doc_matching.py
+++ b/oraclesrv/doc_matching.py
@@ -1,8 +1,8 @@
 from flask import current_app
 
-from oraclesrv.utils import get_solr_data_match, get_solr_data_match_doi, get_solr_data_match_doctype_case, get_solr_data_match_pubnote
+from oraclesrv.utils import get_solr_data_match, get_solr_data_match_doi, get_solr_data_match_doctype_case, \
+    get_solr_data_match_pubnote, add_a_record, is_eprint_bibcode
 from oraclesrv.score import clean_metadata, get_matches, encode_author, format_author, get_doi_match, get_db_match
-from oraclesrv.utils import add_a_record
 
 def get_requests_params(payload, param, default_value=None, default_type=str):
     """
@@ -43,6 +43,9 @@ def __init__(self, payload, save=True):
         self.save_to_db = save
         self.extra_filter = get_requests_params(payload, 'extra_filter')
 
+        if not self.doctype:
+            self.doctype = current_app.config['ORACLE_DOCTYPE_EPRINT'] if is_eprint_bibcode(self.source_bibcode) else current_app.config['ORACLE_DOCTYPE_PUB']
+
     def create_and_return_response(self, match, query, comment=None):
         """
 
@@ -70,7 +73,7 @@ def query_doctype(self, comment):
         results, query, solr_status_code = get_solr_data_match_doctype_case(self.author, self.year, self.doctype, '"%s"' % '" OR "'.join(self.match_doctype))
         # if any records from solr
         if isinstance(results, list) and len(results) > 0:
-            match = get_matches(self.source_bibcode, self.abstract, self.title, self.author, self.year, None, results)
+            match = get_matches(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, None, results)
             if not match:
                 current_app.logger.debug('No result from solr for %s.'%doctype)
                 comment += ' No result from solr for %s.'%doctype
@@ -92,7 +95,7 @@ def query_doi(self, comment):
         # if any records from solr
         # compute the score, if score is 0 doi was wrong, so continue on to query using similar
         if isinstance(results, list) and len(results) > 0:
-            match = get_doi_match(self.source_bibcode, self.abstract, self.title, self.author, self.year, self.doi, results)
+            match = get_doi_match(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, self.doi, results)
             if match:
                 return self.create_and_return_response(match, query), ''
             else:
@@ -117,7 +120,7 @@ def query_pubnote(self, comment):
         # if any records from solr
         # compute the score, if score is 0 doi was wrong, so continue on to query using similar
         if isinstance(results, list) and len(results) > 0:
-            match = get_doi_match(self.source_bibcode, self.abstract, self.title, self.author, self.year, self.doi, results)
+            match = get_doi_match(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, self.doi, results)
             if match:
                 return self.create_and_return_response(match, query), ''
             else:
@@ -150,7 +153,7 @@ def query_abstract_or_title(self, comment):
                 return self.create_and_return_response([], query, 'status code: %d' % solr_status_code)
         # got records from solr, see if we can get a match
         else:
-            match = get_matches(self.source_bibcode, self.abstract, self.title, self.author, self.year, self.doi, results)
+            match = get_matches(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, self.doi, results)
             if len(match) > 0:
                 return self.create_and_return_response(match, query, comment)
             # otherwise if no match with abstract, and we think we should have this in solr
@@ -178,7 +181,7 @@ def query_abstract_or_title(self, comment):
             return self.create_and_return_response(match='', query=query, comment=comment)
 
         # got results with title, see if it can be matched
-        match = get_matches(self.source_bibcode, self.abstract, self.title, self.author, self.year, None, results)
+        match = get_matches(self.source_bibcode, self.doctype, self.abstract, self.title, self.author, self.year, None, results)
         return self.create_and_return_response(match, query, comment)
 
     def save_match(self, result):
@@ -247,7 +250,7 @@ def process(self):
         self.match_doctype = ' OR '.join(self.match_doctype)
 
         # if doi is available from the eprint try query on doi first
-        if self.doi and self.doctype == 'eprint':
+        if self.doi and self.doctype == current_app.config['ORACLE_DOCTYPE_EPRINT']:
             result, comment = self.query_doi(comment)
             if result:
                 self.save_match(result)
diff --git a/oraclesrv/models.py b/oraclesrv/models.py
index 4891428..34c227f 100644
--- a/oraclesrv/models.py
+++ b/oraclesrv/models.py
@@ -1,6 +1,8 @@
 
 import re
 
+from flask import current_app
+
 from sqlalchemy import Float, String, Column, DateTime, func
 from sqlalchemy.ext.declarative import declarative_base
 
@@ -56,10 +58,10 @@ def set_eprint_bibcode(self, source_bibcode, matched_bibcode, source_bibcode_doc
         :return:
         """
         if source_bibcode_doctype:
-            if source_bibcode_doctype == 'eprint':
+            if source_bibcode_doctype == current_app.config['ORACLE_DOCTYPE_EPRINT']:
                 self.eprint_bibcode = source_bibcode
                 return self.eprint_bibcode
-            if source_bibcode_doctype == 'article':
+            if source_bibcode_doctype == current_app.config['ORACLE_DOCTYPE_PUB']:
                 self.eprint_bibcode = matched_bibcode
                 return self.eprint_bibcode
 
diff --git a/oraclesrv/score.py b/oraclesrv/score.py
index 5a5bd02..971a41f 100644
--- a/oraclesrv/score.py
+++ b/oraclesrv/score.py
@@ -15,6 +15,7 @@
 
 from oraclesrv.utils import get_a_record, get_a_matched_record
 from oraclesrv.keras_model import KerasModel
+from oraclesrv.models import DocMatch
 
 confidence_model = KerasModel()
 
@@ -113,11 +114,11 @@ def get_refereed_score(is_refereed):
         return current_app.config['ORACLE_SERVICE_REFEREED_SCORE']
     return current_app.config['ORACLE_SERVICE_NOT_REFEREED_SCORE']
 
-re_match_arXiv = re.compile(r'(\d\d\d\darXiv.*)')
-def get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs):
+def get_matches(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs):
     """
 
     :param source_bibcode:
+    :param doctype:
     :param abstract:
     :param title:
     :param author:
@@ -177,7 +178,7 @@ def get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs
         # if we are matching with eprints, consider eprint a refereed manuscript
         # else check the flag for refereed in the property field
         # if not refereed we want to penalize the confidence score
-        match_refereed = True if 'eprint' in doc.get('doctype') else (True if 'REFEREED' in doc.get('property', []) else False)
+        match_refereed = True if current_app.config['ORACLE_DOCTYPE_EPRINT'] in doc.get('doctype') else (True if 'REFEREED' in doc.get('property', []) else False)
         confidence = float(confidence_format % (confidence_model.predict(scores) * get_refereed_score(match_refereed)))
 
         # see if either of these bibcodes have already been matched
@@ -209,9 +210,15 @@ def get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs
             elif (source_bibcode in prev_bibcodes or match_bibcode in prev_bibcodes) and prev_confidence > confidence:
                 confidence = prev_confidence
                 scores = []
+                # either or both have been matched, so use the previous match
+                # find out if source bibcode is an eprint to assign it correctly
                 if not (source_bibcode in prev_bibcodes and match_bibcode in prev_bibcodes):
-                    source_bibcode = prev_match['eprint_bibcode']
-                    match_bibcode = prev_match['pub_bibcode']
+                    if doctype == current_app.config['ORACLE_DOCTYPE_EPRINT']:
+                        source_bibcode = prev_match['eprint_bibcode']
+                        match_bibcode = prev_match['pub_bibcode']
+                    elif doctype == current_app.config['ORACLE_DOCTYPE_PUB']:
+                        source_bibcode = prev_match['pub_bibcode']
+                        match_bibcode = prev_match['eprint_bibcode']
 
         result = {'source_bibcode': source_bibcode, 'matched_bibcode': match_bibcode,
                   'confidence': confidence, 'matched': int(confidence > 0.5),
@@ -237,10 +244,11 @@ def get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs
 
     return []
 
-def get_doi_match(source_bibcode, abstract, title, author, year, doi, matched_docs):
+def get_doi_match(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs):
     """
 
     :param source_bibcode:
+    :param doctype:
     :param abstract:
     :param title:
     :param author:
@@ -249,7 +257,7 @@ def get_doi_match(source_bibcode, abstract, title, author, year, doi, matched_do
     :param matched_docs:
     :return:
     """
-    results = get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs)
+    results = get_matches(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs)
     if len(results) == 1:
         return results
     return []
diff --git a/oraclesrv/tests/unittests/test_oracle_db.py b/oraclesrv/tests/unittests/test_oracle_db.py
index d1246a2..7dc6b19 100644
--- a/oraclesrv/tests/unittests/test_oracle_db.py
+++ b/oraclesrv/tests/unittests/test_oracle_db.py
@@ -15,7 +15,7 @@
 from oraclesrv.tests.unittests.base import TestCaseDatabase
 from oraclesrv.utils import get_a_record, del_records, add_a_record, query_docmatch, query_source_score, lookup_confidence, \
     get_a_matched_record, query_docmatch, query_source_score, lookup_confidence, delete_tmp_matches, replace_tmp_with_canonical, \
-    delete_multi_matches, clean_db, get_tmp_bibcodes, get_muti_matches, add_records, get_solr_data_chunk
+    delete_multi_matches, clean_db, get_tmp_bibcodes, get_muti_matches, add_records, get_solr_data_chunk, is_eprint_bibcode
 from oraclesrv.score import get_matches, get_doi_match
 from oraclesrv.models import DocMatch, ConfidenceLookup, EPrintBibstemLookup
 
@@ -172,6 +172,7 @@ def test_docmatch(self):
         title = 'Nonlinear corrections in the quantization of a weakly nonideal Bose gas   at zero temperature. II. The general case'
         author = 'Smolyakov, Mikhail N.'
         year = 2022
+        doctype = 'eprint'
         matched_docs = [{'bibcode': '2021CSF...15311505S',
                          'abstract': 'In the present paper, quantization of a weakly nonideal Bose gas at zero temperature along the lines of the well-known Bogolyubov approach is performed. The analysis presented in this paper is based, in addition to the steps of the original Bogolyubov approach, on the use of nonoscillation modes (which are also solutions of the linearized Heisenberg equation) for recovering the canonical commutation relations in the linear approximation, as well as on the calculation of the first nonlinear correction to the solution of the linearized Heisenberg equation which satisfies the canonical commutation relations at the next order. It is shown that, at least in the case of free quasi-particles, consideration of the nonlinear correction automatically solves the problem of nonconserved particle number, which is inherent to the original approach.',
                          'author_norm': ['Smolyakov, M'],
@@ -231,7 +232,7 @@ def test_docmatch(self):
                       'confidence': 0.7142998,
                       'matched': 1,
                       'scores': {'abstract': 0.76, 'title': 0.98, 'author': 1, 'year': 1}}
-        matches = get_matches(source_bibcode, abstract, title, author, year, None, matched_docs)
+        matches = get_matches(source_bibcode, doctype, abstract, title, author, year, None, matched_docs)
         self.assertEqual(len(matches), 1)
         self.assertDictEqual(matches[0], best_match)
 
@@ -248,7 +249,7 @@ def test_docmatch(self):
                       'confidence': 0.9829099,
                       'matched': 1,
                       'scores': {}}
-        matches = get_matches(source_bibcode, abstract, title, author, year, None, matched_docs)
+        matches = get_matches(source_bibcode, doctype, abstract, title, author, year, None, matched_docs)
         self.assertEqual(len(matches), 1)
         self.assertDictEqual(matches[0], best_match)
 
@@ -283,7 +284,7 @@ def test_docmatch_changed_bibcode(self):
                          'title':['Numerical investigation of non-Gaussianities in the phase and modulus of density Fourier modes'],
                          'year':'2022'}]
         # match it
-        matches = get_doi_match(source_doc['bibcode'], source_doc['abstract'], source_doc['title'],
+        matches = get_doi_match(source_doc['bibcode'], source_doc['doctype'], source_doc['abstract'], source_doc['title'],
                               source_doc['author'], source_doc['year'], source_doc['doi'], matched_docs)
         # current match the same as prev with the new bibcode
         current_match = {'source_bibcode': '2021arXiv210911714Q',
@@ -1138,29 +1139,14 @@ def test_get_solr_data_chunk(self):
             self.assertIsNone(result)
             self.assertIsInstance(status_code, requests.exceptions.RequestException)
 
-    def add_docmatch_data(self):
+    def test_is_eprint_bibcode(self):
         """
-        Add docmatch data
+        Test is_eprint_bibcode function
         """
-        self.add_eprint_bibstem_lookup_data()
-
-        docmatch_data = [
-                        ('2021arXiv210312030S', '2021CSF...15311505S', 0.9829099),
-                        ('2017arXiv171111082H', '2018ConPh..59...16H', 0.9877064),
-                        ('2018arXiv181105526S', '2022NuPhB.98015830S', 0.97300124),
-        ]
-
-        docmatch_records = []
-        for record in docmatch_data:
-            docmatch_record = {'source_bibcode': record[0],
-                               'matched_bibcode': record[1],
-                               'confidence': record[2]}
-            docmatch_records.append(docmatch_record)
+        self.add_docmatch_data()
 
-        headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
-        response = self.client.put('/add', data=json.dumps(docmatch_records), headers=headers)
-        self.assertEqual(response._status_code, 200)
-        self.assertEqual(response.json['status'], 'updated db with new data successfully')
+        self.assertTrue(is_eprint_bibcode('2021arXiv210312030S'))
+        self.assertFalse(is_eprint_bibcode('2021CSF...15311505S'))
 
 
 if __name__ == "__main__":
diff --git a/oraclesrv/tests/unittests/test_oracle_service.py b/oraclesrv/tests/unittests/test_oracle_service.py
index 4ba29b5..1718d77 100644
--- a/oraclesrv/tests/unittests/test_oracle_service.py
+++ b/oraclesrv/tests/unittests/test_oracle_service.py
@@ -42,6 +42,7 @@ def test_get_matches(self, mock_query_eprint_bibstem):
         author = 'Smolyakov, Mikhail N.'
         year = 2022
         doi = ['10.1016/j.chaos.2021.111505']
+        doctype = 'eprint'
 
         matched_docs = [{'bibcode': '2021CSF...15311505S',
                          'abstract': 'In the present paper, quantization of a weakly nonideal Bose gas at zero temperature along the lines of the well-known Bogolyubov approach is performed. The analysis presented in this paper is based, in addition to the steps of the original Bogolyubov approach, on the use of nonoscillation modes (which are also solutions of the linearized Heisenberg equation) for recovering the canonical commutation relations in the linear approximation, as well as on the calculation of the first nonlinear correction to the solution of the linearized Heisenberg equation which satisfies the canonical commutation relations at the next order. It is shown that, at least in the case of free quasi-particles, consideration of the nonlinear correction automatically solves the problem of nonconserved particle number, which is inherent to the original approach.',
@@ -54,7 +55,7 @@ def test_get_matches(self, mock_query_eprint_bibstem):
                          'property': ['ARTICLE','EPRINT_OPENACCESS','ESOURCE','OPENACCESS','REFEREED']}]
 
         # abstract, no doi
-        match = get_matches(source_bibcode, abstract, title, author, year, None, matched_docs)
+        match = get_matches(source_bibcode, doctype, abstract, title, author, year, None, matched_docs)
         self.assertEqual(len(match), 1)
         self.assertDictEqual(match[0], {'source_bibcode': '2022arXiv220606316S',
                                         'matched_bibcode': '2021CSF...15311505S',
@@ -63,7 +64,7 @@ def test_get_matches(self, mock_query_eprint_bibstem):
                                         'scores': {'abstract': 0.76, 'title': 0.98, 'author': 1, 'year': 1}})
 
         # no abstract, no doi
-        match = get_matches(source_bibcode, '', title, author, year, None, matched_docs)
+        match = get_matches(source_bibcode, doctype, '', title, author, year, None, matched_docs)
         self.assertEqual(len(match), 1)
         self.assertDictEqual(match[0], {'source_bibcode': '2022arXiv220606316S',
                                         'matched_bibcode': '2021CSF...15311505S',
@@ -72,7 +73,7 @@ def test_get_matches(self, mock_query_eprint_bibstem):
                                         'scores': {'abstract': None, 'title': 0.98, 'author': 1, 'year': 1}})
 
         # abstract, doi
-        match = get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs)
+        match = get_matches(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs)
         self.assertEqual(len(match), 1)
         self.assertDictEqual(match[0], {'source_bibcode': '2022arXiv220606316S',
                                         'matched_bibcode': '2021CSF...15311505S',
@@ -81,7 +82,7 @@ def test_get_matches(self, mock_query_eprint_bibstem):
                                         'scores': {'abstract': 0.76, 'title': 0.98, 'author': 1, 'year': 1, 'doi': 1}})
 
         # no abstract, doi
-        match = get_matches(source_bibcode, '', title, author, year, doi, matched_docs)
+        match = get_matches(source_bibcode, doctype, '', title, author, year, doi, matched_docs)
         self.assertEqual(len(match), 1)
         self.assertDictEqual(match[0], {'source_bibcode': '2022arXiv220606316S',
                                         'matched_bibcode': '2021CSF...15311505S',
@@ -109,6 +110,7 @@ def test_get_matches_multi_hits(self, mock_query_eprint_bibstem):
         author = 'Smolyakov, Mikhail N.'
         year = 2022
         doi = ['10.1016/j.chaos.2021.111505']
+        doctype = 'eprint'
 
         # when multiple matches are found, and one record is returned
         matched_docs = [{'bibcode': '2021CSF...15311505S',
@@ -130,7 +132,7 @@ def test_get_matches_multi_hits(self, mock_query_eprint_bibstem):
                          'property': ['ARTICLE', 'ESOURCE', 'NOT REFEREED']}
                         ]
 
-        match = get_matches(source_bibcode, '', title, author, year, doi, matched_docs)
+        match = get_matches(source_bibcode, doctype, '', title, author, year, doi, matched_docs)
         self.assertEqual(len(match), 1)
         self.assertDictEqual(match[0], {'source_bibcode': '2022arXiv220606316S',
                                         'matched_bibcode': '2021CSF...15311505S',
@@ -155,7 +157,7 @@ def test_get_matches_multi_hits(self, mock_query_eprint_bibstem):
                          'property': ['ARTICLE', 'ESOURCE', 'NOT REFEREED']}
                         ]
 
-        match = get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs)
+        match = get_matches(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs)
 
         self.assertEqual(len(match), 2)
         self.assertEqual(match[0], {'source_bibcode': '2022arXiv220606316S',
@@ -171,9 +173,9 @@ def test_get_matches_multi_hits(self, mock_query_eprint_bibstem):
 
     @mock.patch('oraclesrv.score.get_a_record')
     @mock.patch('oraclesrv.utils.query_eprint_bibstem')
-    def test_get_matches_when_prev_match_exist(self, mock_query_eprint_bibstem, mock_get_a_record):
+    def test_get_matches_when_prev_match_exist_source_eprint(self, mock_query_eprint_bibstem, mock_get_a_record):
         """
-        Test get_matches function of the score module when there is a prev match
+        Test get_matches function of the score module when there is a prev match and source bibcode is eprint
         """
         # mock the eprint_bibstem patterns
         mock_query_eprint_bibstem.return_value = (
@@ -190,6 +192,7 @@ def test_get_matches_when_prev_match_exist(self, mock_query_eprint_bibstem, mock
         author = 'Smolyakov, Mikhail N.'
         year = 2022
         doi = ['10.1016/j.chaos.2021.111505']
+        doctype = 'eprint'
 
         matched_docs = [{'bibcode': '2021CSF...15311505S',
                          'abstract': 'In the present paper, quantization of a weakly nonideal Bose gas at zero temperature along the lines of the well-known Bogolyubov approach is performed. The analysis presented in this paper is based, in addition to the steps of the original Bogolyubov approach, on the use of nonoscillation modes (which are also solutions of the linearized Heisenberg equation) for recovering the canonical commutation relations in the linear approximation, as well as on the calculation of the first nonlinear correction to the solution of the linearized Heisenberg equation which satisfies the canonical commutation relations at the next order. It is shown that, at least in the case of free quasi-particles, consideration of the nonlinear correction automatically solves the problem of nonconserved particle number, which is inherent to the original approach.',
@@ -209,7 +212,7 @@ def test_get_matches_when_prev_match_exist(self, mock_query_eprint_bibstem, mock
             'confidence': 0.9
         }
 
-        match = get_matches(source_bibcode, abstract, title, author, year, None, matched_docs)
+        match = get_matches(source_bibcode, doctype, abstract, title, author, year, None, matched_docs)
         self.assertEqual(len(match), 1)
         self.assertDictEqual(match[0], {'source_bibcode': '2022arXiv220606316S',
                                         'matched_bibcode': '2022CSF...27421615S',
@@ -217,6 +220,55 @@ def test_get_matches_when_prev_match_exist(self, mock_query_eprint_bibstem, mock
                                         'matched': 1,
                                         'scores': {}})
 
+    @mock.patch('oraclesrv.score.get_a_record')
+    @mock.patch('oraclesrv.utils.query_eprint_bibstem')
+    def test_get_matches_when_prev_match_exist_source_pub(self, mock_query_eprint_bibstem, mock_get_a_record):
+        """
+        Test get_matches when there is a prev match with higher confidence and the source bibcode is a pub: the prev match is returned with its pub bibcode as source and its eprint bibcode as matched
+        """
+        # mock the eprint_bibstem patterns
+        mock_query_eprint_bibstem.return_value = (
+            [
+                {'name': 'arXiv', 'pattern': r'^(\d\d\d\d(?:arXiv|acc\.phys|adap\.org|alg\.geom|ao\.sci|astro\.ph|atom\.ph|bayes\.an|chao\.dyn|chem\.ph|cmp\.lg|comp\.gas|cond\.mat|cs\.|dg\.ga|funct\.an|gr\.qc|hep\.ex|hep\.lat|hep\.ph|hep\.th|math\.|math\.ph|mtrl\.th|nlin\.|nucl\.ex|nucl\.th|patt\.sol|physics\.|plasm\.ph|q\.alg|q\.bio|quant\.ph|solv\.int|supr\.con))'},
+                {'name': 'Earth Science', 'pattern': r'^(\d\d\d\d(?:EaArX|esoar))'},
+            ],
+            200
+        )
+
+        source_bibcode = '2021CSF...15311505S'
+        abstract = 'In the present paper, discussion of the canonical quantization of a weakly nonideal Bose gas at zero temperature along the lines of the famous Bogolyubov approach is continued. Contrary to the previous paper on this subject, here the two-body interaction potential is considered in the general form. It is shown that consideration of the first nonlinear correction automatically solves the problem of nonconserved particle number, which is inherent to the original approach, without any modification of the resulting effective Hamiltonian.'
+        title = 'Nonlinear corrections in the quantization of a weakly nonideal Bose gas   at zero temperature. II. The general case'
+        author = 'Smolyakov, Mikhail N.'
+        year = 2022
+        doi = ['10.1016/j.chaos.2021.111505']
+        doctype = 'article'
+
+        matched_docs = [{'bibcode': '2022arXiv220606316S',
+                         'abstract': 'In the present paper, quantization of a weakly nonideal Bose gas at zero temperature along the lines of the well-known Bogolyubov approach is performed. The analysis presented in this paper is based, in addition to the steps of the original Bogolyubov approach, on the use of nonoscillation modes (which are also solutions of the linearized Heisenberg equation) for recovering the canonical commutation relations in the linear approximation, as well as on the calculation of the first nonlinear correction to the solution of the linearized Heisenberg equation which satisfies the canonical commutation relations at the next order. It is shown that, at least in the case of free quasi-particles, consideration of the nonlinear correction automatically solves the problem of nonconserved particle number, which is inherent to the original approach.',
+                         'author_norm': ['Smolyakov, M'],
+                         'doctype': 'article',
+                         'doi': ['10.1016/j.chaos.2021.111505'],
+                         'identifier': ['10.1016/j.chaos.2021.111505', 'arXiv:2103.12030', '2021CSF...15311505S', '2021arXiv210312030S'],
+                         'title': ['Nonlinear corrections in the quantization of a weakly nonideal Bose gas at zero temperature'],
+                         'year': '2021',
+                         'property': ['ARTICLE','EPRINT_OPENACCESS','ESOURCE','OPENACCESS','REFEREED']}]
+        # NOTE(review): the matched doc has an arXiv bibcode but doctype 'article' and an identifier list that does not include that bibcode -- confirm this is intended
+
+        # mock the previous match with higher confidence
+        mock_get_a_record.return_value = {
+            'eprint_bibcode': '2022arXiv220606316S',
+            'pub_bibcode': '2022CSF...27421615S',
+            'confidence': 0.9
+        }
+
+        match = get_matches(source_bibcode, doctype, abstract, title, author, year, None, matched_docs)
+        self.assertEqual(len(match), 1)
+        self.assertDictEqual(match[0], {'source_bibcode': '2022CSF...27421615S',
+                                        'matched_bibcode': '2022arXiv220606316S',
+                                        'confidence': 0.9,
+                                        'matched': 1,
+                                        'scores': {}})
+
     def test_get_matches_when_source_and_match_equal(self):
         """
         Test get_matches function of the score module when source bibcode and match bibcode are the same
@@ -227,6 +279,7 @@ def test_get_matches_when_source_and_match_equal(self):
         author = 'Smolyakov, Mikhail N.'
         year = 2022
         doi = ['10.1016/j.chaos.2021.111505']
+        doctype = 'eprint'
 
         # when match and source are the same
         matched_docs = [{'bibcode': '2022arXiv220606316S',
@@ -239,7 +292,7 @@ def test_get_matches_when_source_and_match_equal(self):
                          'year': '2022',
                          'property': ['ARTICLE', 'ESOURCE', 'NOT REFEREED']}
                         ]
-        match = get_matches(source_bibcode, abstract, title, author, year, None, matched_docs)
+        match = get_matches(source_bibcode, doctype, abstract, title, author, year, None, matched_docs)
         self.assertEqual(match, [])
 
     @mock.patch('oraclesrv.utils.query_eprint_bibstem')
@@ -262,6 +315,7 @@ def test_get_match_for_pub_with_doi(self, mock_query_eprint_bibstem):
         author = 'Panda, Swayamtrupta; Dias dos Santos, Denimara'
         year = 2022
         doi = ['10.31059/aat.vol3.iss1.pp27-34']
+        doctype = 'eprint'
 
         matched_docs = [{'bibcode': '2021arXiv211101521P',
                          'abstract': 'The CaFe Project involves the study of the properties of the low ionization emission lines (LILs) pertaining to the broad-line region (BLR) in active galaxies. These emission lines, especially the singly-ionized iron (Fe II) in the optical and the corresponding singly-ionized calcium (Ca II) in the near-infrared (NIR) are found to show a strong correlation in their emission strengths, i.e. with respect to the broad H$\\beta$ emission line, the latter also belonging to the same category of LILs. The origin of this correlation is attributed to the similarity in the physical conditions necessary to emit these lines - especially in terms of the strength of the ionization from the central continuum source and the local number density of available matter in these regions. In this paper, we focus on the issue of the spectral energy distribution (SED) characteristic to a prototypical Type-1 Narrow-line Seyfert galaxy (NLS1) - I Zw 1. We extract the continuum from quasi-simultaneous spectroscopic measurements ranging from the near-UV ($\\sim$1200A) to the near-infrared ($\\sim$24000A) to construct the SED and supplement it with archival X-ray measurements available for this source. Using the photoionization code CLOUDY, we assess and compare the contribution of the prominent \"Big Blue Bump\" seen in our SED versus the SED used in our previous work, wherein the latter was constructed from archival, multi-epoch photometric measurements. Following the prescription from our previous work, we constrain the physical parameter space to optimize the emission from these LILs and discuss the implication of the use of a \"better\" SED.',
@@ -274,7 +328,7 @@ def test_get_match_for_pub_with_doi(self, mock_query_eprint_bibstem):
                          'doi_pubnote': '10.31059/aat.vol3.iss1.pp27-34'}]
 
         # abstract, no doi
-        match = get_matches(source_bibcode, abstract, title, author, year, doi, matched_docs)
+        match = get_matches(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs)
         self.assertEqual(len(match), 1)
         self.assertDictEqual(match[0], {'source_bibcode': '2022AcAT....3a..27P',
                                         'matched_bibcode': '2021arXiv211101521P',
@@ -540,9 +594,12 @@ def test_query_abstract_or_title(self):
                 mock.patch('oraclesrv.doc_matching.get_matches', return_value=[]), \
                 mock.patch.object(self.current_app.logger, 'debug') as mock_debug:
 
+            # get_solr_data_match is called twice:
+            # first call: some results with abstract, but no match
+            # second call: no results with title
             mock_get_solr_data_match.side_effect = [
-                ([{'bibcode': '2000Bibcode.......A'}], 'mock_query_with_abstract', 200),  # some results with abstract, but no match
-                ([], 'mock_query_with_abstract', 400)  # no results with title
+                ([{'bibcode': '2000Bibcode.......A'}], 'mock_query_with_abstract', 200),
+                ([], 'mock_query_with_abstract', 400)
             ]
 
             result = doc_match.query_abstract_or_title(comment)
@@ -706,11 +763,12 @@ def test_get_doi_match(self):
         author = 'Smolyakov, Mikhail N.'
         year = 2022
         doi = ['10.1016/j.chaos.2021.111505']
+        doctype = 'eprint'
 
         # when there are no matches with doi
         with mock.patch('oraclesrv.score.get_doi_match') as mock_get_doi_match:
             mock_get_doi_match.return_value = None
-            self.assertEqual(get_doi_match(source_bibcode, abstract, title, author, year, doi, matched_docs=[]), [])
+            self.assertEqual(get_doi_match(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs=[]), [])
 
         matched_docs = [{'bibcode': '2021CSF...15311505S',
                          'abstract': 'In the present paper, quantization of a weakly nonideal Bose gas at zero temperature along the lines of the well-known Bogolyubov approach is performed. The analysis presented in this paper is based, in addition to the steps of the original Bogolyubov approach, on the use of nonoscillation modes (which are also solutions of the linearized Heisenberg equation) for recovering the canonical commutation relations in the linear approximation, as well as on the calculation of the first nonlinear correction to the solution of the linearized Heisenberg equation which satisfies the canonical commutation relations at the next order. It is shown that, at least in the case of free quasi-particles, consideration of the nonlinear correction automatically solves the problem of nonconserved particle number, which is inherent to the original approach.',
@@ -731,7 +789,7 @@ def test_get_doi_match(self):
         # when there are more than one matches with doi
         with mock.patch('oraclesrv.score.get_doi_match') as mock_get_doi_match:
             mock_get_doi_match.return_value = matched_docs
-            self.assertEqual(get_doi_match(source_bibcode, abstract, title, author, year, doi, matched_docs=[]), [])
+            self.assertEqual(get_doi_match(source_bibcode, doctype, abstract, title, author, year, doi, matched_docs=[]), [])
 
     def test_get_solr_data_match(self):
         """
diff --git a/oraclesrv/utils.py b/oraclesrv/utils.py
index 3c83cf3..57dd5a9 100644
--- a/oraclesrv/utils.py
+++ b/oraclesrv/utils.py
@@ -672,3 +672,18 @@ def query_eprint_bibstem():
     except SQLAlchemyError as e:
         current_app.logger.error('SQLAlchemy: ' + str(e))
         return [], 404
+
+def is_eprint_bibcode(source_bibcode):
+    """
+    Check whether source_bibcode matches an 'arXiv' or 'Earth Science' eprint bibstem pattern.
+
+    :param source_bibcode: bibcode to test; None or empty is treated as not an eprint
+    :return: True if one of those bibstem patterns matches source_bibcode, False otherwise
+    """
+    # a missing bibcode cannot be an eprint, and re.search would raise TypeError on None
+    if not source_bibcode:
+        return False
+
+    # the status code is ignored: when the lookup yields an empty list the result is simply False
+    eprint_bibstems, _ = query_eprint_bibstem()
+    return any(b['name'] in ['arXiv', 'Earth Science'] and re.search(b['pattern'], source_bibcode) for b in eprint_bibstems)