Skip to content

Commit 21e8910

Browse files
authored
Adding fuzzy=false to find_source_in_db and use in ingest_source (#116)
Tests: * add some data to be used in all tests in conftest * add LHS 292 / LHS 2924 fuzzy test * use a specific reference name since GitHub Actions does not have an ADS_TOKEN ingest_source: * check the reference first in 'ingest_source' * remove the None default for reference since it is required. find_source_in_db: * implement fuzzy=False as default ingest_publication: * fix missing 'using'. Closes #106 * polish 'using' * try to decrease the complexity of ingest_publication
1 parent 3adf9ed commit 21e8910

File tree

5 files changed

+108
-82
lines changed

5 files changed

+108
-82
lines changed

astrodb_utils/publications.py

Lines changed: 60 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -268,18 +268,23 @@ def ingest_publication(
268268
return
269269

270270
if not ignore_ads:
271-
use_ads = check_ads_token()
272-
if not use_ads and (not reference and (not doi or not bibcode)):
273-
logger.error(
274-
"An ADS_TOKEN environment variable must be set"
275-
"in order to auto-populate the fields.\n"
276-
"Without an ADS_TOKEN, name and bibcode or DOI must be set explicity."
277-
)
278-
return
279-
else:
280-
use_ads = False
271+
ads_token = check_ads_token()
281272

282-
logger.debug(f"Use ADS set to {use_ads}")
273+
if not ads_token:
274+
logger.warning(
275+
"An ADS_TOKEN environment variable is not set.\n"
276+
"setting ignore_ads=True.")
277+
ignore_ads = True
278+
279+
if (not reference and (not doi or not bibcode)):
280+
logger.error(
281+
"An ADS_TOKEN environment variable must be set"
282+
"in order to auto-populate the fields.\n"
283+
"Without an ADS_TOKEN, name and bibcode or DOI must be set explicity."
284+
)
285+
return
286+
287+
logger.debug(f"ignore_ads set to {ignore_ads}")
283288

284289
if bibcode:
285290
if "arXiv" in bibcode:
@@ -291,40 +296,15 @@ def ingest_publication(
291296
arxiv_id = None
292297

293298
name_add, bibcode_add, doi_add = "", "", ""
294-
# Search ADS uing a provided arxiv id
295-
if arxiv_id and use_ads:
296-
arxiv_matches = ads.SearchQuery(
297-
q=arxiv_id, fl=["id", "bibcode", "title", "first_author", "year", "doi"]
298-
)
299-
arxiv_matches_list = list(arxiv_matches)
300-
if len(arxiv_matches_list) != 1:
301-
logger.error("should only be one matching arxiv id")
302-
return
303-
304-
if len(arxiv_matches_list) == 1:
305-
logger.debug(f"Publication found in ADS using arxiv id: , {arxiv_id}")
306-
article = arxiv_matches_list[0]
307-
logger.debug(
308-
f"{article.first_author}, {article.year}, {article.bibcode}, {article.title}"
309-
)
310-
if not reference: # generate the name if it was not provided
311-
name_stub = article.first_author.replace(",", "").replace(" ", "")
312-
name_add = name_stub[0:4] + article.year[-2:]
313-
else:
314-
name_add = reference
315-
description = article.title[0]
316-
bibcode_add = article.bibcode
317-
doi_add = article.doi[0]
299+
using = f"ref: {name_add}, bibcode: {bibcode_add}, doi: {doi_add}"
318300

319-
using = f"ref: {name_add}, bibcode: {bibcode_add}, doi: {doi_add}"
320-
elif arxiv_id:
321-
name_add = reference
322-
bibcode_add = arxiv_id
323-
doi_add = doi
324-
using = f"ref: {name_add}, bibcode: {bibcode_add}, doi: {doi_add}"
301+
# Search ADS using a provided arXiv id
302+
if arxiv_id:
303+
name_add, bibcode_add, doi_add, description = find_pub_using_arxiv_id(arxiv_id, reference, doi, ignore_ads)
304+
using = f"ref: {name_add}, bibcode: {bibcode_add}, doi: {doi_add}"
325305

326306
# Search ADS using a provided DOI
327-
if doi and use_ads:
307+
if doi and not ignore_ads:
328308
doi_matches = ads.SearchQuery(
329309
doi=doi, fl=["id", "bibcode", "title", "first_author", "year", "doi"]
330310
)
@@ -335,7 +315,6 @@ def ingest_publication(
335315

336316
if len(doi_matches_list) == 1:
337317
logger.debug(f"Publication found in ADS using DOI: {doi}")
338-
using = doi
339318
article = doi_matches_list[0]
340319
logger.debug(
341320
f"{article.first_author}, {article.year},"
@@ -349,12 +328,14 @@ def ingest_publication(
349328
description = article.title[0]
350329
bibcode_add = article.bibcode
351330
doi_add = article.doi[0]
331+
using = f"ref: {name_add}, bibcode: {bibcode_add}, doi: {doi_add}"
352332
elif doi:
353333
name_add = reference
354334
bibcode_add = bibcode
355335
doi_add = doi
336+
using = f"ref: {name_add}, bibcode: {bibcode_add}, doi: {doi_add}"
356337

357-
if bibcode and use_ads:
338+
if bibcode and not ignore_ads:
358339
bibcode_matches = ads.SearchQuery(
359340
bibcode=bibcode,
360341
fl=["id", "bibcode", "title", "first_author", "year", "doi"],
@@ -370,7 +351,6 @@ def ingest_publication(
370351

371352
elif len(bibcode_matches_list) == 1:
372353
logger.debug(f"Publication found in ADS using bibcode: {bibcode}")
373-
using = str(bibcode)
374354
article = bibcode_matches_list[0]
375355
logger.debug(
376356
f"{article.first_author}, {article.year}, "
@@ -387,6 +367,7 @@ def ingest_publication(
387367
doi_add = None
388368
else:
389369
doi_add = article.doi[0]
370+
using = f"ref: {name_add}, bibcode: {bibcode_add}, doi: {doi_add}"
390371
elif bibcode:
391372
name_add = reference
392373
bibcode_add = bibcode
@@ -395,7 +376,7 @@ def ingest_publication(
395376

396377
if reference and not bibcode and not doi:
397378
name_add = reference
398-
using = "user input"
379+
using = "ref: {reference} user input. No bibcode or doi provided."
399380

400381
new_ref = [
401382
{
@@ -438,3 +419,36 @@ def check_ads_token():
438419

439420
return use_ads
440421

422+
423+
def find_pub_using_arxiv_id(arxiv_id, reference, doi, ignore_ads):
    """
    Build the publication fields for a publication identified by an arXiv id.

    Parameters
    ----------
    arxiv_id: str
        arXiv identifier. Used as the ADS query string, or passed through
        as the bibcode when ADS is not queried.
    reference: str
        Short reference name. If empty and ADS is queried, a name is generated
        from the first four characters of the first author's name
        (commas and spaces removed) plus the last two digits of the year.
    doi: str
        DOI supplied by the caller. Only used when ADS is not queried.
    ignore_ads: bool
        If True, ADS is not queried and the provided values are passed through.

    Returns
    -------
    tuple or None
        (name_add, bibcode_add, doi_add, description).
        description is None when ADS is not queried.
        None is returned (after logging an error) when the ADS query
        does not yield exactly one match.
    """
    # Without ADS there is nothing to look up: pass the caller's values through.
    if ignore_ads:
        return reference, arxiv_id, doi, None

    arxiv_matches = ads.SearchQuery(
        q=arxiv_id, fl=["id", "bibcode", "title", "first_author", "year", "doi"]
    )
    arxiv_matches_list = list(arxiv_matches)
    if len(arxiv_matches_list) != 1:
        logger.error("should only be one matching arxiv id")
        # NOTE(review): this returns None, but ingest_publication unpacks four
        # values from this call, which would raise a TypeError there.
        # Confirm how the caller should handle this case.
        return

    article = arxiv_matches_list[0]
    logger.debug(f"Publication found in ADS using arxiv id: , {arxiv_id}")
    logger.debug(
        f"{article.first_author}, {article.year}, {article.bibcode}, {article.title}"
    )

    if not reference:  # generate the name if it was not provided
        name_stub = article.first_author.replace(",", "").replace(" ", "")
        name_add = name_stub[0:4] + article.year[-2:]
    else:
        name_add = reference

    description = article.title[0]
    bibcode_add = article.bibcode
    # An ADS record may carry no DOI (presumably common for arXiv-only entries);
    # fall back to None instead of failing on the index, as the bibcode search
    # in ingest_publication does.
    doi_add = article.doi[0] if article.doi else None

    return name_add, bibcode_add, doi_add, description

astrodb_utils/sources.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,11 @@ def find_source_in_db(
2828
ra_col_name="ra_deg",
2929
dec_col_name="dec_deg",
3030
use_simbad=True,
31+
fuzzy=False
3132
):
3233
"""
3334
Find a source in the database given a source name and optional coordinates.
35+
Uses the astrodbkit .search_object and .query_region methods to search the Sources and Names tables.
3436
3537
Parameters
3638
----------
@@ -50,6 +52,8 @@ def find_source_in_db(
5052
use_simbad: bool
5153
Use Simbad to resolve the source name if it is not found in the database. Default is True.
5254
Set to False if no internet connection.
55+
fuzzy: bool
56+
Use fuzzy search to find source name in database. Default is False.
5357
5458
Returns
5559
-------
@@ -74,7 +78,7 @@ def find_source_in_db(
7478

7579
# NO MATCHES
7680
# If no matches and fuzzy is True, try fuzzy search
77-
if len(db_name_matches) == 0:
81+
if len(db_name_matches) == 0 and fuzzy:
7882
logger.debug(f"{source}: No name matches, trying fuzzy search")
7983
db_name_matches = db.search_object(
8084
source,
@@ -224,8 +228,8 @@ def ingest_names(
224228
def ingest_source(
225229
db,
226230
source,
231+
reference: str,
227232
*,
228-
reference: str = None,
229233
ra: float = None,
230234
dec: float = None,
231235
epoch: str = None,
@@ -282,6 +286,19 @@ def ingest_source(
282286

283287
logger.debug(f"Trying to ingest source: {source}")
284288

289+
# Make sure reference is provided and is in the Publications table
290+
ref_check = find_publication(db, reference=reference)
291+
logger.debug(f"ref_check: {ref_check}")
292+
if ref_check[0] is False:
293+
msg = (
294+
f"Skipping: {source}."
295+
f"Discovery reference {reference} is either missing or "
296+
" is not in Publications table. \n"
297+
f"(Add it with ingest_publication function.)"
298+
)
299+
exit_function(msg, raise_error)
300+
return
301+
285302
# Find out if source is already in database or not
286303
if search_db:
287304
logger.debug(f"Checking database for: {source} at ra: {ra}, dec: {dec}")
@@ -316,19 +333,6 @@ def ingest_source(
316333
exit_function(msg1 + msg2, raise_error)
317334
return
318335

319-
# Make sure reference is provided and in References table
320-
ref_check = find_publication(db, reference=reference)
321-
logger.debug(f"ref_check: {ref_check}")
322-
if ref_check[0] is False:
323-
msg = (
324-
f"Skipping: {source}."
325-
f"Discovery reference {reference} is either missing or "
326-
" is not in Publications table. \n"
327-
f"(Add it with ingest_publication function.)"
328-
)
329-
exit_function(msg, raise_error)
330-
return
331-
332336
# Try to get coordinates from SIMBAD if they were not provided
333337
if (ra is None or dec is None) and use_simbad:
334338
# Try to get coordinates from SIMBAD

astrodb_utils/tests/conftest.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import pytest
55

66
from astrodb_utils import load_astrodb, logger
7+
from astrodb_utils.publications import ingest_publication
8+
from astrodb_utils.sources import ingest_source
79

810
logger.setLevel("DEBUG")
911

@@ -24,6 +26,22 @@ def db():
2426
# Confirm file was created
2527
assert os.path.exists(DB_NAME)
2628

27-
logger.info("Loaded SIMPLE database using load_astrodb function in conftest.py")
29+
logger.info("Loaded AstroDB Template database using load_astrodb function in conftest.py")
30+
31+
32+
33+
ingest_publication(
34+
db,
35+
reference="Refr20",
36+
bibcode="2020MNRAS.496.1922B",
37+
doi="10.1093/mnras/staa1522",
38+
ignore_ads=True,
39+
)
40+
41+
ingest_publication(db, doi="10.1086/161442", reference="Prob83")
42+
43+
ingest_source(db, "LHS 2924", reference="Prob83")
44+
45+
return db
46+
2847

29-
return db

astrodb_utils/tests/test_publications.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,6 @@
44
from astrodb_utils.publications import find_publication, ingest_publication
55

66

7-
def test_ingest_publications(db):
8-
# add a made up publication and make sure it's there
9-
ingest_publication(
10-
db,
11-
reference="Refr20",
12-
bibcode="2020MNRAS.496.1922B",
13-
doi="10.1093/mnras/staa1522",
14-
ignore_ads=True,
15-
)
16-
assert (
17-
db.query(db.Publications)
18-
.filter(db.Publications.c.reference == "Refr20")
19-
.count()
20-
== 1
21-
)
22-
23-
247
def test_find_publication(db):
258
assert not find_publication(db)[0] # False
269
assert find_publication(db, reference="Refr20")[0] # True

astrodb_utils/tests/test_sources.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111
ingest_source,
1212
)
1313

14-
# TODO: Ingest publication just for these tests so they can be run independent of test_publications.py
15-
1614

1715
@pytest.mark.parametrize(
1816
"source_data",
@@ -90,6 +88,15 @@ def test_find_source_in_db(db):
9088
)
9189
assert len(search_result) == 0
9290

91+
search_result = find_source_in_db(db,"LHS 2924")
92+
assert search_result[0] == "LHS 2924"
93+
94+
search_result = find_source_in_db(db,"LHS 292", fuzzy=False)
95+
assert len(search_result) == 0
96+
97+
search_result = find_source_in_db(db,"LHS 292", fuzzy=True)
98+
assert search_result[0] == "LHS 2924" # This is wrong and a result of fuzzy matching
99+
93100

94101
def test_find_source_in_db_errors(db):
95102
with pytest.raises(KeyError) as error_message:

0 commit comments

Comments
 (0)