From b4e9250d192dc073e924b6ed5c123e415fb2fec2 Mon Sep 17 00:00:00 2001 From: kelle Date: Mon, 24 Mar 2025 17:33:05 -0700 Subject: [PATCH 01/15] fix arxiv_id bug --- astrodb_utils/publications.py | 10 ++++++---- astrodb_utils/tests/test_publications.py | 21 ++++++++++++++++++++- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/astrodb_utils/publications.py b/astrodb_utils/publications.py index 36c9b86..9da2afb 100644 --- a/astrodb_utils/publications.py +++ b/astrodb_utils/publications.py @@ -300,6 +300,7 @@ def ingest_publication( # Search ADS uing a provided arxiv id if arxiv_id: + logger.debug(f"Searching ADS using arxiv id: {arxiv_id}, reference: {reference}, doi: {doi}, ignore_ads: {ignore_ads}") name_add, bibcode_add, doi_add, description = find_pub_using_arxiv_id(arxiv_id, reference, doi, ignore_ads) using = f"ref: {name_add}, bibcode: {bibcode_add}, doi: {doi_add}" @@ -434,16 +435,17 @@ def find_pub_using_arxiv_id(arxiv_id, reference, doi, ignore_ads): logger.debug(f"Publication found in ADS using arxiv id: , {arxiv_id}") article = arxiv_matches_list[0] logger.debug( - f"{article.first_author}, {article.year}, {article.bibcode}, {article.title}" + f"{article.first_author}, {article.year}, {article.bibcode}, {article.doi}, {article.title}" ) if not reference: # generate the name if it was not provided name_stub = article.first_author.replace(",", "").replace(" ", "") name_add = name_stub[0:4] + article.year[-2:] else: name_add = reference - description = article.title[0] - bibcode_add = article.bibcode - doi_add = article.doi[0] + + description = article.title[0] + bibcode_add = article.bibcode + doi_add = article.doi[0] else: name_add = reference diff --git a/astrodb_utils/tests/test_publications.py b/astrodb_utils/tests/test_publications.py index bd2d50f..b3a60bd 100644 --- a/astrodb_utils/tests/test_publications.py +++ b/astrodb_utils/tests/test_publications.py @@ -1,7 +1,11 @@ import pytest from astrodb_utils import AstroDBError -from 
astrodb_utils.publications import find_publication, ingest_publication +from astrodb_utils.publications import ( + find_publication, + ingest_publication, + find_pub_using_arxiv_id, +) def test_find_publication(db): @@ -39,3 +43,18 @@ def test_ingest_publication_errors(db): assert " similar publication already exists" in str(error_message.value) # TODO - Mock environment where ADS_TOKEN is not set. #117 + +def test_ingest_publication(db): + ingest_publication(db, bibcode="2023arXiv230812107B") + + assert find_publication(db, reference="Burg24")[0] # True + +def test_find_pub_using_arxix_id(db): + name_add, bibcode_add, doi_add, description = find_pub_using_arxiv_id( + "2023arXiv230812107B", reference=None, doi=None, ignore_ads=False + ) + + assert name_add == "Burg24" + assert bibcode_add == "2024ApJ...962..177B" + assert doi_add == "10.3847/1538-4357/ad206f" + assert description == "UNCOVER: JWST Spectroscopy of Three Cold Brown Dwarfs at Kiloparsec-scale Distances" From b0675cb8dedbc056a27ba65d69f14cda0a6bdd4e Mon Sep 17 00:00:00 2001 From: kelle Date: Mon, 24 Mar 2025 23:47:54 -0700 Subject: [PATCH 02/15] trying to reduce complexity --- astrodb_utils/publications.py | 115 +++++++++++------------ astrodb_utils/tests/test_publications.py | 16 +++- 2 files changed, 68 insertions(+), 63 deletions(-) diff --git a/astrodb_utils/publications.py b/astrodb_utils/publications.py index 9da2afb..31e973f 100644 --- a/astrodb_utils/publications.py +++ b/astrodb_utils/publications.py @@ -140,23 +140,7 @@ def find_publication( if logger.level == 10: # debug pub_search_table.pprint_all() - # Try to find numbers in the reference which might be a date - dates = re.findall(r"\d+", reference) - # try to find a two digit date - if len(dates) == 0: - logger.debug(f"Could not find a date in {reference}") - two_digit_date = None - elif len(dates) == 1: - if len(dates[0]) == 4: - two_digit_date = dates[0][2:] - elif len(dates[0]) == 2: - two_digit_date = dates[0] - else: - 
logger.debug(f"Could not find a two digit date using {dates}") - two_digit_date = None - else: - logger.debug(f"Could not find a two digit date using {dates}") - two_digit_date = None + two_digit_date = find_dates_in_reference(reference) if two_digit_date: logger.debug(f"Trying to limit using {two_digit_date}") @@ -182,38 +166,31 @@ def find_publication( return False, n_pubs_found_short_date else: return False, n_pubs_found_short + if n_pubs_found == 0 and bibcode and "arXiv" in bibcode and use_ads: logger.debug(f"Using ADS to find alt name for {bibcode}") - arxiv_id = bibcode - arxiv_matches = ads.SearchQuery( - q=arxiv_id, fl=["id", "bibcode", "title", "first_author", "year", "doi"] + results = find_pub_using_arxiv_id( + bibcode, reference=None, doi=None, ignore_ads=~use_ads ) - arxiv_matches_list = list(arxiv_matches) - if len(arxiv_matches_list) == 1: - logger.debug(f"Publication found in ADS using arxiv id: , {arxiv_id}") - article = arxiv_matches_list[0] + bibcode_alt = results[1] + not_null_pub_filters = [] + not_null_pub_filters.append(db.Publications.c.bibcode.ilike(bibcode_alt)) + print(not_null_pub_filters) + pub_search_table = Table() + pub_search_table = ( + db.query(db.Publications).filter(or_(*not_null_pub_filters)).table() + ) + if len(pub_search_table) == 1: logger.debug( - f"{article.first_author}, {article.year}, {article.bibcode}, {article.title}" + f"Found {len(pub_search_table)} matching publications for " + f"{reference} or {doi} or {bibcode}: {pub_search_table['reference'].data}" ) - bibcode_alt = article.bibcode - not_null_pub_filters = [] - not_null_pub_filters.append(db.Publications.c.bibcode.ilike(bibcode_alt)) - print(not_null_pub_filters) - pub_search_table = Table() - pub_search_table = ( - db.query(db.Publications).filter(or_(*not_null_pub_filters)).table() - ) - if len(pub_search_table) == 1: - logger.debug( - f"Found {len(pub_search_table)} matching publications for " - f"{reference} or {doi} or {bibcode}: 
{pub_search_table['reference'].data}" - ) - if logger.level <= 10: # debug - pub_search_table.pprint_all() - - return True, pub_search_table["reference"].data[0] - else: - return False, len(pub_search_table) + if logger.level <= 10: # debug + pub_search_table.pprint_all() + + return True, pub_search_table["reference"].data[0] + else: + return False, len(pub_search_table) else: return False, n_pubs_found @@ -267,23 +244,15 @@ def ingest_publication( logger.error("Publication, DOI, or Bibcode is required input") return - if not ignore_ads: - ads_token = check_ads_token() - - if not ads_token: - logger.warning( - "An ADS_TOKEN environment variable is not set.\n" - "setting ignore_ads=True.") - ignore_ads = True - - if (not reference and (not doi or not bibcode)): - logger.error( - "An ADS_TOKEN environment variable must be set" - "in order to auto-populate the fields.\n" - "Without an ADS_TOKEN, name and bibcode or DOI must be set explicity." - ) - return - + if ignore_ads is False and check_ads_token() is False: + ignore_ads = True + if (not reference and (not doi or not bibcode)): + logger.error( + "An ADS_TOKEN environment variable must be set" + "in order to auto-populate the fields.\n" + "Without an ADS_TOKEN, name and bibcode or DOI must be set explicity." 
+ ) + return logger.debug(f"ignore_ads set to {ignore_ads}") if bibcode: @@ -416,6 +385,9 @@ def check_ads_token(): if ads.config.token: use_ads = True else: + logger.warning( + "An ADS_TOKEN environment variable is not set.\n" + "setting ignore_ads=True/use_ads=False") use_ads = False return use_ads @@ -454,3 +426,24 @@ def find_pub_using_arxiv_id(arxiv_id, reference, doi, ignore_ads): description = None return name_add, bibcode_add, doi_add, description + +def find_dates_in_reference(reference): + # Try to find numbers in the reference which might be a date + dates = re.findall(r"\d+", reference) + # try to find a two digit date + if len(dates) == 0: + logger.debug(f"Could not find a date in {reference}") + two_digit_date = None + elif len(dates) == 1: + if len(dates[0]) == 4: + two_digit_date = dates[0][2:] + elif len(dates[0]) == 2: + two_digit_date = dates[0] + else: + logger.debug(f"Could not find a two digit date using {dates}") + two_digit_date = None + else: + logger.debug(f"Could not find a two digit date using {dates}") + two_digit_date = None + + return two_digit_date \ No newline at end of file diff --git a/astrodb_utils/tests/test_publications.py b/astrodb_utils/tests/test_publications.py index b3a60bd..d77f222 100644 --- a/astrodb_utils/tests/test_publications.py +++ b/astrodb_utils/tests/test_publications.py @@ -2,9 +2,10 @@ from astrodb_utils import AstroDBError from astrodb_utils.publications import ( + find_dates_in_reference, + find_pub_using_arxiv_id, find_publication, ingest_publication, - find_pub_using_arxiv_id, ) @@ -26,6 +27,7 @@ def test_find_publication(db): assert find_publication(db, reference=None) == (False, 0) + #find_publication(db,bibcode="2022arXiv220800211G" ) @pytest.mark.skip(reason="Fuzzy matching not perfect yet. 
#27") # TODO: find publication only finds one of the Gaia publications @@ -46,9 +48,9 @@ def test_ingest_publication_errors(db): def test_ingest_publication(db): ingest_publication(db, bibcode="2023arXiv230812107B") - assert find_publication(db, reference="Burg24")[0] # True + def test_find_pub_using_arxix_id(db): name_add, bibcode_add, doi_add, description = find_pub_using_arxiv_id( "2023arXiv230812107B", reference=None, doi=None, ignore_ads=False @@ -58,3 +60,13 @@ def test_find_pub_using_arxix_id(db): assert bibcode_add == "2024ApJ...962..177B" assert doi_add == "10.3847/1538-4357/ad206f" assert description == "UNCOVER: JWST Spectroscopy of Three Cold Brown Dwarfs at Kiloparsec-scale Distances" + + results = find_pub_using_arxiv_id("2022arXiv220800211G", reference=None, doi=None, ignore_ads=False) + print(results) + assert results[0] == "Gaia23" + assert results[1] == "2023A&A...674A...1G" + + +def test_find_dates_in_reference(): + assert find_dates_in_reference("Wright_2010") == "10" + assert find_dates_in_reference("Refr20") == "20" From 036d41a2b441140852ee79afdbe066324d1c5935 Mon Sep 17 00:00:00 2001 From: kelle Date: Mon, 24 Mar 2025 23:54:28 -0700 Subject: [PATCH 03/15] skip tests that need ADS --- astrodb_utils/tests/test_publications.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/astrodb_utils/tests/test_publications.py b/astrodb_utils/tests/test_publications.py index d77f222..497277c 100644 --- a/astrodb_utils/tests/test_publications.py +++ b/astrodb_utils/tests/test_publications.py @@ -46,11 +46,13 @@ def test_ingest_publication_errors(db): # TODO - Mock environment where ADS_TOKEN is not set. 
#117 +@pytest.mark.skip(reason="Need to set up mock environment for ADS_TOKEN") def test_ingest_publication(db): ingest_publication(db, bibcode="2023arXiv230812107B") assert find_publication(db, reference="Burg24")[0] # True +@pytest.mark.skip(reason="Need to set up mock environment for ADS_TOKEN") def test_find_pub_using_arxix_id(db): name_add, bibcode_add, doi_add, description = find_pub_using_arxiv_id( "2023arXiv230812107B", reference=None, doi=None, ignore_ads=False From 202a363daa3624b376141ec964dd5c323622d36b Mon Sep 17 00:00:00 2001 From: kelle Date: Mon, 24 Mar 2025 23:58:36 -0700 Subject: [PATCH 04/15] lint --- astrodb_utils/tests/test_publications.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/astrodb_utils/tests/test_publications.py b/astrodb_utils/tests/test_publications.py index 497277c..6654c22 100644 --- a/astrodb_utils/tests/test_publications.py +++ b/astrodb_utils/tests/test_publications.py @@ -27,7 +27,8 @@ def test_find_publication(db): assert find_publication(db, reference=None) == (False, 0) - #find_publication(db,bibcode="2022arXiv220800211G" ) + # find_publication(db,bibcode="2022arXiv220800211G" ) + @pytest.mark.skip(reason="Fuzzy matching not perfect yet. 
#27") # TODO: find publication only finds one of the Gaia publications @@ -61,9 +62,14 @@ def test_find_pub_using_arxix_id(db): assert name_add == "Burg24" assert bibcode_add == "2024ApJ...962..177B" assert doi_add == "10.3847/1538-4357/ad206f" - assert description == "UNCOVER: JWST Spectroscopy of Three Cold Brown Dwarfs at Kiloparsec-scale Distances" + assert ( + description + == "UNCOVER: JWST Spectroscopy of Three Cold Brown Dwarfs at Kiloparsec-scale Distances" + ) - results = find_pub_using_arxiv_id("2022arXiv220800211G", reference=None, doi=None, ignore_ads=False) + results = find_pub_using_arxiv_id( + "2022arXiv220800211G", reference=None, doi=None, ignore_ads=False + ) print(results) assert results[0] == "Gaia23" assert results[1] == "2023A&A...674A...1G" From 2906cef1f6c7d07ccd53720912ec80e07a2688b3 Mon Sep 17 00:00:00 2001 From: kelle Date: Tue, 25 Mar 2025 22:12:56 -0700 Subject: [PATCH 05/15] decrease complexity of ingest_publication --- astrodb_utils/publications.py | 202 +++++++++-------------- astrodb_utils/tests/test_publications.py | 59 +++++-- 2 files changed, 124 insertions(+), 137 deletions(-) diff --git a/astrodb_utils/publications.py b/astrodb_utils/publications.py index 31e973f..f5ca30e 100644 --- a/astrodb_utils/publications.py +++ b/astrodb_utils/publications.py @@ -1,6 +1,7 @@ import logging import os import re +from typing import Literal import ads import sqlalchemy.exc @@ -169,8 +170,8 @@ def find_publication( if n_pubs_found == 0 and bibcode and "arXiv" in bibcode and use_ads: logger.debug(f"Using ADS to find alt name for {bibcode}") - results = find_pub_using_arxiv_id( - bibcode, reference=None, doi=None, ignore_ads=~use_ads + results = search_ads( + bibcode, query_type="arxiv" ) bibcode_alt = results[1] not_null_pub_filters = [] @@ -199,6 +200,7 @@ def find_publication( def ingest_publication( db, + *, doi: str = None, bibcode: str = None, reference: str = None, @@ -226,11 +228,9 @@ def ingest_publication( (e.g, Xu__21 or LiYB21) 
description: str, optional Description of the paper, typically the title of the papre [optional] - ignore_ads: bool - - Returns - ------- - publication: str + ignore_ads: bool (default=False) + If True, do not use ADS to search for the publication [optional] + If False, an ADS_TOKEN environment variable must be set. See Also -------- @@ -244,6 +244,10 @@ def ingest_publication( logger.error("Publication, DOI, or Bibcode is required input") return + if reference and not bibcode and not doi: + name_add = reference + using = f"ref: {reference} user input. No bibcode or DOI provided." + if ignore_ads is False and check_ads_token() is False: ignore_ads = True if (not reference and (not doi or not bibcode)): @@ -255,99 +259,37 @@ def ingest_publication( return logger.debug(f"ignore_ads set to {ignore_ads}") - if bibcode: - if "arXiv" in bibcode: - arxiv_id = bibcode - bibcode = None - else: - arxiv_id = None - else: - arxiv_id = None + if ignore_ads is True and reference is None: + logger.error("A reference must be provided if ignore_ads is set to True") + return name_add, bibcode_add, doi_add = "", "", "" - using = f"ref: {name_add}, bibcode: {bibcode_add}, doi: {doi_add}" - - # Search ADS uing a provided arxiv id - if arxiv_id: - logger.debug(f"Searching ADS using arxiv id: {arxiv_id}, reference: {reference}, doi: {doi}, ignore_ads: {ignore_ads}") - name_add, bibcode_add, doi_add, description = find_pub_using_arxiv_id(arxiv_id, reference, doi, ignore_ads) - using = f"ref: {name_add}, bibcode: {bibcode_add}, doi: {doi_add}" - - # Search ADS using a provided DOI - if doi and not ignore_ads: - doi_matches = ads.SearchQuery( - doi=doi, fl=["id", "bibcode", "title", "first_author", "year", "doi"] - ) - doi_matches_list = list(doi_matches) - if len(doi_matches_list) != 1: - logger.error("should only be one matching DOI") - return - - if len(doi_matches_list) == 1: - logger.debug(f"Publication found in ADS using DOI: {doi}") - article = doi_matches_list[0] - logger.debug( - 
f"{article.first_author}, {article.year}," - f"{article.bibcode}, {article.title}" - ) - if not reference: # generate the name if it was not provided - name_stub = article.first_author.replace(",", "").replace(" ", "") - name_add = name_stub[0:4] + article.year[-2:] + using = f"ref: {name_add}, bibcode: {bibcode_add}, DOI: {doi_add}" + + if ignore_ads is False: + # Figure out how to search ADS + if doi: # Search ADS using a provided DOI + query_type = "doi" + value = doi + elif bibcode: + if "arXiv" in bibcode: + query_type = "arxiv" # Search ADS using an arXiv ID else: - name_add = reference - description = article.title[0] - bibcode_add = article.bibcode - doi_add = article.doi[0] - using = f"ref: {name_add}, bibcode: {bibcode_add}, doi: {doi_add}" - elif doi: - name_add = reference - bibcode_add = bibcode - doi_add = doi - using = f"ref: {name_add}, bibcode: {bibcode_add}, doi: {doi_add}" - - if bibcode and not ignore_ads: - bibcode_matches = ads.SearchQuery( - bibcode=bibcode, - fl=["id", "bibcode", "title", "first_author", "year", "doi"], - ) - bibcode_matches_list = list(bibcode_matches) - if len(bibcode_matches_list) == 0: - msg = f"Not a valid bibcode: {bibcode}" - raise AstroDBError(msg) - - elif len(bibcode_matches_list) > 1: - msg = f"Should only be one matching bibcode for: {bibcode}" - raise AstroDBError(msg) - - elif len(bibcode_matches_list) == 1: - logger.debug(f"Publication found in ADS using bibcode: {bibcode}") - article = bibcode_matches_list[0] - logger.debug( - f"{article.first_author}, {article.year}, " - f"{article.bibcode}, {article.doi}, {article.title}" - ) - if not reference: # generate the name if it was not provided - name_stub = article.first_author.replace(",", "").replace(" ", "") - name_add = name_stub[0:4] + article.year[-2:] - else: - name_add = reference - description = article.title[0] - bibcode_add = article.bibcode - if article.doi is None: - doi_add = None - else: - doi_add = article.doi[0] - using = f"ref: {name_add}, 
bibcode: {bibcode_add}, doi: {doi_add}" - elif bibcode: + query_type = "bibcode" # Search ADS using a provided bibcode + value = bibcode + else: + logger.error("Unexpected error. No doi or bibcode provided") + return + + logger.debug(f"Searching ADS using {query_type}: {value}, reference: {reference}") + name_add, bibcode_add, doi_add, description = search_ads(value, query_type=query_type, reference=reference) + else: name_add = reference bibcode_add = bibcode doi_add = doi - using = f"ref: {name_add}, bibcode: {bibcode_add}, doi: {doi_add}" - - if reference and not bibcode and not doi: - name_add = reference - using = "ref: {reference} user input. No bibcode or doi provided." - + + using = f"ref: {name_add}, bibcode: {bibcode_add}, DOI: {doi_add}" + new_ref = [ { "reference": name_add, @@ -393,40 +335,58 @@ def check_ads_token(): return use_ads -def find_pub_using_arxiv_id(arxiv_id, reference, doi, ignore_ads): - if not ignore_ads: - arxiv_matches = ads.SearchQuery( - q=arxiv_id, fl=["id", "bibcode", "title", "first_author", "year", "doi"] - ) - arxiv_matches_list = list(arxiv_matches) - if len(arxiv_matches_list) != 1: - logger.error("should only be one matching arxiv id") - return +def search_ads(value: str, query_type: Literal["arxiv","bibcode","doi"], reference): + if check_ads_token() is False: + logger.error("An ADS_TOKEN environment variable must be set") + return - if len(arxiv_matches_list) == 1: - logger.debug(f"Publication found in ADS using arxiv id: , {arxiv_id}") - article = arxiv_matches_list[0] - logger.debug( - f"{article.first_author}, {article.year}, {article.bibcode}, {article.doi}, {article.title}" + if query_type == "arxiv": + ads_matches = ads.SearchQuery( + q=value, fl=["id", "bibcode", "title", "first_author", "year", "doi"] ) - if not reference: # generate the name if it was not provided - name_stub = article.first_author.replace(",", "").replace(" ", "") - name_add = name_stub[0:4] + article.year[-2:] - else: - name_add = reference + 
elif query_type == "bibcode": + ads_matches = ads.SearchQuery( + bibcode=value, + fl=["id", "bibcode", "title", "first_author", "year", "doi"], + ) + elif query_type == "doi": + ads_matches = ads.SearchQuery( + doi=value, fl=["id", "bibcode", "title", "first_author", "year", "doi"] + ) + else: + logger.error(f"Invalid query type: {query_type}. Valid types are 'arxiv', 'bibcode', or 'doi'") + return - description = article.title[0] - bibcode_add = article.bibcode - doi_add = article.doi[0] - + ads_matches_list = list(ads_matches) + + if len(ads_matches_list) == 0: + msg = f"No ADS matches for {query_type}: {value}" + logger.warning(msg) + return + + if len(ads_matches_list) > 1: + logger.warning(f"More than one matching ADS record for {query_type}: {value}") + return + + if len(ads_matches_list) == 1: + logger.debug(f"Publication found in ADS for {query_type}: {value}") + article = ads_matches_list[0] + logger.debug( + f"{article.first_author}, {article.year}, {article.bibcode}, {article.doi}, {article.title}" + ) + if not reference: # generate the name if it was not provided + name_stub = article.first_author.replace(",", "").replace(" ", "") + name_add = name_stub[0:4] + article.year[-2:] else: name_add = reference - bibcode_add = arxiv_id - doi_add = doi - description = None + + description = article.title[0] + bibcode_add = article.bibcode + doi_add = article.doi[0] return name_add, bibcode_add, doi_add, description + def find_dates_in_reference(reference): # Try to find numbers in the reference which might be a date dates = re.findall(r"\d+", reference) @@ -446,4 +406,4 @@ def find_dates_in_reference(reference): logger.debug(f"Could not find a two digit date using {dates}") two_digit_date = None - return two_digit_date \ No newline at end of file + return two_digit_date diff --git a/astrodb_utils/tests/test_publications.py b/astrodb_utils/tests/test_publications.py index 6654c22..7458d4a 100644 --- a/astrodb_utils/tests/test_publications.py +++ 
b/astrodb_utils/tests/test_publications.py @@ -1,12 +1,7 @@ import pytest from astrodb_utils import AstroDBError -from astrodb_utils.publications import ( - find_dates_in_reference, - find_pub_using_arxiv_id, - find_publication, - ingest_publication, -) +from astrodb_utils.publications import find_dates_in_reference, find_publication, ingest_publication, search_ads, check_ads_token def test_find_publication(db): @@ -46,17 +41,26 @@ def test_ingest_publication_errors(db): assert " similar publication already exists" in str(error_message.value) # TODO - Mock environment where ADS_TOKEN is not set. #117 + ingest_publication(db, bibcode="2024ApJ...962..177B", ignore_ads=True) + -@pytest.mark.skip(reason="Need to set up mock environment for ADS_TOKEN") def test_ingest_publication(db): - ingest_publication(db, bibcode="2023arXiv230812107B") - assert find_publication(db, reference="Burg24")[0] # True + if check_ads_token() is False: + pytest.skip("ADS_TOKEN not set") + ingest_publication(db, bibcode="2023arXiv230812107B") + assert find_publication(db, reference="Burg24")[0] # True + + ingest_publication(db, reference="test05", bibcode="2024ApJ...962..177B", ignore_ads=True) + assert find_publication(db, reference="test05")[0] # True + + ingest_publication(db, reference="test10", doi="10.1086/513700", ignore_ads=True) + assert find_publication(db, reference="test10")[0] # True -@pytest.mark.skip(reason="Need to set up mock environment for ADS_TOKEN") -def test_find_pub_using_arxix_id(db): - name_add, bibcode_add, doi_add, description = find_pub_using_arxiv_id( - "2023arXiv230812107B", reference=None, doi=None, ignore_ads=False +@pytest.mark.skipif(check_ads_token() is False, reason="ADS_TOKEN not set") +def test_search_ads_using_arxix_id(db): + name_add, bibcode_add, doi_add, description = search_ads( + "2023arXiv230812107B", query_type="arxiv", reference=None, ) assert name_add == "Burg24" @@ -67,14 +71,37 @@ def test_find_pub_using_arxix_id(db): == "UNCOVER: JWST 
Spectroscopy of Three Cold Brown Dwarfs at Kiloparsec-scale Distances" ) - results = find_pub_using_arxiv_id( - "2022arXiv220800211G", reference=None, doi=None, ignore_ads=False + results = search_ads( + "2022arXiv220800211G", query_type="arxiv", reference=None, ) - print(results) assert results[0] == "Gaia23" assert results[1] == "2023A&A...674A...1G" +@pytest.mark.skipif(check_ads_token() is False, reason="ADS_TOKEN not set") +def test_search_ads_using_doi(): + results = search_ads("10.1093/mnras/staa1522", query_type="doi", reference=None) + assert results[0] == "Belo20" + assert results[1] == "2020MNRAS.496.1922B" + assert results[2] == "10.1093/mnras/staa1522" + assert results[3] == "Unresolved stellar companions with Gaia DR2 astrometry" + + results = search_ads("10.3847/1538-4357/ad206f", query_type="doi", reference="test03") + assert results[0] == "test03" + assert results[1] == "2024ApJ...962..177B" + assert results[2] == "10.3847/1538-4357/ad206f" + assert results[3] == "UNCOVER: JWST Spectroscopy of Three Cold Brown Dwarfs at Kiloparsec-scale Distances" + + +@pytest.mark.skipif(check_ads_token() is False, reason="ADS_TOKEN not set") +def test_search_ads_using_bibcode(): + results = search_ads("2020MNRAS.496.1922B", query_type="bibcode", reference="Blah98") + assert results[0] == "Blah98" + assert results[1] == "2020MNRAS.496.1922B" + assert results[2] == "10.1093/mnras/staa1522" + assert results[3] == "Unresolved stellar companions with Gaia DR2 astrometry" + + def test_find_dates_in_reference(): assert find_dates_in_reference("Wright_2010") == "10" assert find_dates_in_reference("Refr20") == "20" From 4d90fce229277170fdb4e5e22fa6f00e7f190614 Mon Sep 17 00:00:00 2001 From: kelle Date: Tue, 1 Apr 2025 07:46:54 -0700 Subject: [PATCH 06/15] lint --- astrodb_utils/tests/test_publications.py | 38 ++++++++++++++++-------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/astrodb_utils/tests/test_publications.py 
b/astrodb_utils/tests/test_publications.py index 7458d4a..7580b6b 100644 --- a/astrodb_utils/tests/test_publications.py +++ b/astrodb_utils/tests/test_publications.py @@ -1,7 +1,13 @@ import pytest from astrodb_utils import AstroDBError -from astrodb_utils.publications import find_dates_in_reference, find_publication, ingest_publication, search_ads, check_ads_token +from astrodb_utils.publications import ( + check_ads_token, + find_dates_in_reference, + find_publication, + ingest_publication, + search_ads, +) def test_find_publication(db): @@ -45,12 +51,9 @@ def test_ingest_publication_errors(db): def test_ingest_publication(db): - if check_ads_token() is False: - pytest.skip("ADS_TOKEN not set") - ingest_publication(db, bibcode="2023arXiv230812107B") - assert find_publication(db, reference="Burg24")[0] # True - - ingest_publication(db, reference="test05", bibcode="2024ApJ...962..177B", ignore_ads=True) + ingest_publication( + db, reference="test05", bibcode="2024ApJ...962..177B", ignore_ads=True + ) assert find_publication(db, reference="test05")[0] # True ingest_publication(db, reference="test10", doi="10.1086/513700", ignore_ads=True) @@ -60,7 +63,9 @@ def test_ingest_publication(db): @pytest.mark.skipif(check_ads_token() is False, reason="ADS_TOKEN not set") def test_search_ads_using_arxix_id(db): name_add, bibcode_add, doi_add, description = search_ads( - "2023arXiv230812107B", query_type="arxiv", reference=None, + "2023arXiv230812107B", + query_type="arxiv", + reference=None, ) assert name_add == "Burg24" @@ -72,7 +77,9 @@ def test_search_ads_using_arxix_id(db): ) results = search_ads( - "2022arXiv220800211G", query_type="arxiv", reference=None, + "2022arXiv220800211G", + query_type="arxiv", + reference=None, ) assert results[0] == "Gaia23" assert results[1] == "2023A&A...674A...1G" @@ -86,16 +93,23 @@ def test_search_ads_using_doi(): assert results[2] == "10.1093/mnras/staa1522" assert results[3] == "Unresolved stellar companions with Gaia DR2 astrometry" - 
results = search_ads("10.3847/1538-4357/ad206f", query_type="doi", reference="test03") + results = search_ads( + "10.3847/1538-4357/ad206f", query_type="doi", reference="test03" + ) assert results[0] == "test03" assert results[1] == "2024ApJ...962..177B" assert results[2] == "10.3847/1538-4357/ad206f" - assert results[3] == "UNCOVER: JWST Spectroscopy of Three Cold Brown Dwarfs at Kiloparsec-scale Distances" + assert ( + results[3] + == "UNCOVER: JWST Spectroscopy of Three Cold Brown Dwarfs at Kiloparsec-scale Distances" + ) @pytest.mark.skipif(check_ads_token() is False, reason="ADS_TOKEN not set") def test_search_ads_using_bibcode(): - results = search_ads("2020MNRAS.496.1922B", query_type="bibcode", reference="Blah98") + results = search_ads( + "2020MNRAS.496.1922B", query_type="bibcode", reference="Blah98" + ) assert results[0] == "Blah98" assert results[1] == "2020MNRAS.496.1922B" assert results[2] == "10.1093/mnras/staa1522" From 6ecf9dc26b247bfa10716c1dc0ad6dbece8493e8 Mon Sep 17 00:00:00 2001 From: kelle Date: Thu, 3 Apr 2025 11:23:24 -0700 Subject: [PATCH 07/15] try setting ADS_TOKEN using github secrets --- .github/workflows/run_tests.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 42aec4c..f840a35 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -22,6 +22,16 @@ jobs: python-version: ["3.11", "3.12", "3.13"] steps: + - name: Set ADS_TOKEN + shell: bash + env: + ADS_TOKEN: ${{ secrets.ADS_TOKEN }} + run: | + if [[ -z "${ADS_TOKEN}" ]]; then + echo "ADS_TOKEN is not set, skipping ADS tests" + else + echo "ADS_TOKEN is set" + fi - uses: actions/checkout@v4 - name: Checkout template database repo From 14314033804ffa6f8522b09d450a313d338b25f5 Mon Sep 17 00:00:00 2001 From: kelle Date: Thu, 3 Apr 2025 11:48:21 -0700 Subject: [PATCH 08/15] try setting environment variable at the top level --- .github/workflows/run_tests.yml | 11 
+++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index f840a35..b8a4971 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -13,26 +13,25 @@ on: permissions: contents: read +env: + ADS_TOKEN: ${{ secrets.ADS_TOKEN }} + jobs: build: - runs-on: ubuntu-latest strategy: matrix: python-version: ["3.11", "3.12", "3.13"] steps: - - name: Set ADS_TOKEN + - name: Check ADS_TOKEN shell: bash - env: - ADS_TOKEN: ${{ secrets.ADS_TOKEN }} run: | if [[ -z "${ADS_TOKEN}" ]]; then - echo "ADS_TOKEN is not set, skipping ADS tests" + echo "ADS_TOKEN is not set, ADS tests will be skipped" else echo "ADS_TOKEN is set" fi - - uses: actions/checkout@v4 - name: Checkout template database repo uses: actions/checkout@v4 From f7485604305392cde919f3c39eddcd8990e449f8 Mon Sep 17 00:00:00 2001 From: kelle Date: Thu, 3 Apr 2025 11:51:18 -0700 Subject: [PATCH 09/15] trying another spot --- .github/workflows/run_tests.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index b8a4971..5ba0969 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -13,16 +13,16 @@ on: permissions: contents: read -env: - ADS_TOKEN: ${{ secrets.ADS_TOKEN }} - -jobs: +jobs: build: runs-on: ubuntu-latest strategy: matrix: python-version: ["3.11", "3.12", "3.13"] + env: + ADS_TOKEN: ${{ secrets.ADS_TOKEN }} + steps: - name: Check ADS_TOKEN shell: bash From eef48fcf141dab1fe8098b849d54414e3ac5a689 Mon Sep 17 00:00:00 2001 From: kelle Date: Thu, 3 Apr 2025 11:52:06 -0700 Subject: [PATCH 10/15] remove env setting entirely --- .github/workflows/run_tests.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 5ba0969..821c0cc 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ 
-20,9 +20,6 @@ jobs: matrix: python-version: ["3.11", "3.12", "3.13"] - env: - ADS_TOKEN: ${{ secrets.ADS_TOKEN }} - steps: - name: Check ADS_TOKEN shell: bash From 213ead1421c52fdee71eab92a48aa2dab40e8286 Mon Sep 17 00:00:00 2001 From: kelle Date: Thu, 3 Apr 2025 11:54:23 -0700 Subject: [PATCH 11/15] trying to undo the damage --- .github/workflows/run_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 821c0cc..5046dcc 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -21,6 +21,7 @@ jobs: python-version: ["3.11", "3.12", "3.13"] steps: + - uses: actions/checkout@v4 - name: Check ADS_TOKEN shell: bash run: | From c722892e85f094661a32f3833a5b0c91f413ca9f Mon Sep 17 00:00:00 2001 From: kelle Date: Thu, 3 Apr 2025 11:56:14 -0700 Subject: [PATCH 12/15] adding env back in --- .github/workflows/run_tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 5046dcc..7a95d4b 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -24,6 +24,8 @@ jobs: - uses: actions/checkout@v4 - name: Check ADS_TOKEN shell: bash + env: + ADS_TOKEN: ${{ secrets.ADS_TOKEN }} run: | if [[ -z "${ADS_TOKEN}" ]]; then echo "ADS_TOKEN is not set, ADS tests will be skipped" From d91afd499e37ed880161dfeaed9d08db919779c7 Mon Sep 17 00:00:00 2001 From: kelle Date: Thu, 3 Apr 2025 13:44:51 -0700 Subject: [PATCH 13/15] remove duplicate --- .github/workflows/run_tests.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 880c449..7a95d4b 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -33,17 +33,6 @@ jobs: echo "ADS_TOKEN is set" fi - - name: Check ADS_TOKEN - shell: bash - env: - ADS_TOKEN: ${{ secrets.ADS_TOKEN }} - run: | - if [[ -z "${ADS_TOKEN}" ]]; then - echo 
"ADS_TOKEN is not set, ADS tests will be skipped" - else - echo "ADS_TOKEN is set" - fi - - name: Checkout template database repo uses: actions/checkout@v4 with: From 92e47377efb1a0e2f8a6c4bc3f66a7685bf9632b Mon Sep 17 00:00:00 2001 From: kelle Date: Tue, 8 Apr 2025 16:50:45 -0400 Subject: [PATCH 14/15] docstring and _ for convenience functions --- astrodb_utils/publications.py | 46 +++++++++++++++++++++--- astrodb_utils/tests/test_publications.py | 18 +++++----- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/astrodb_utils/publications.py b/astrodb_utils/publications.py index f5ca30e..4080919 100644 --- a/astrodb_utils/publications.py +++ b/astrodb_utils/publications.py @@ -141,7 +141,7 @@ def find_publication( if logger.level == 10: # debug pub_search_table.pprint_all() - two_digit_date = find_dates_in_reference(reference) + two_digit_date = _find_dates_in_reference(reference) if two_digit_date: logger.debug(f"Trying to limit using {two_digit_date}") @@ -170,7 +170,7 @@ def find_publication( if n_pubs_found == 0 and bibcode and "arXiv" in bibcode and use_ads: logger.debug(f"Using ADS to find alt name for {bibcode}") - results = search_ads( + results = _search_ads( bibcode, query_type="arxiv" ) bibcode_alt = results[1] @@ -282,7 +282,7 @@ def ingest_publication( return logger.debug(f"Searching ADS using {query_type}: {value}, reference: {reference}") - name_add, bibcode_add, doi_add, description = search_ads(value, query_type=query_type, reference=reference) + name_add, bibcode_add, doi_add, description = _search_ads(value, query_type=query_type, reference=reference) else: name_add = reference bibcode_add = bibcode @@ -335,7 +335,43 @@ def check_ads_token(): return use_ads -def search_ads(value: str, query_type: Literal["arxiv","bibcode","doi"], reference): +def _search_ads(value: str, query_type: Literal["arxiv","bibcode","doi"], reference): + """ + Search ADS for a publication using the provided string and query type.
+ The query type indicates if the string provided is an arXiv ID, bibcode, or DOI. + The function will return the name, bibcode, doi, and description of the publication + if found. + + It will return None if no match or multiple matches are found. + + Parameters + ---------- + value: str + The value to search for in ADS. + query_type: str + The type of query to perform. Can be one of the following: + - arxiv + - bibcode + - doi + reference: str + The reference name to use if the publication is found. + If not provided, it will be generated from the first author and year. + + Returns + ------- + If no match or multiple matches are found, it returns None. + + If one match is found, it returns a tuple with the following elements: + name_add: str + The name of the publication. + bibcode_add: str + The bibcode of the publication. + doi_add: str + The DOI of the publication. + description: str + The description of the publication (usually the title). + + """ if check_ads_token() is False: logger.error("An ADS_TOKEN environment variable must be set") return @@ -387,7 +423,7 @@ def search_ads(value: str, query_type: Literal["arxiv","bibcode","doi"], referen return name_add, bibcode_add, doi_add, description -def find_dates_in_reference(reference): +def _find_dates_in_reference(reference): # Try to find numbers in the reference which might be a date dates = re.findall(r"\d+", reference) # try to find a two digit date diff --git a/astrodb_utils/tests/test_publications.py b/astrodb_utils/tests/test_publications.py index 7580b6b..4af37b2 100644 --- a/astrodb_utils/tests/test_publications.py +++ b/astrodb_utils/tests/test_publications.py @@ -2,11 +2,11 @@ from astrodb_utils import AstroDBError from astrodb_utils.publications import ( + _find_dates_in_reference, + _search_ads, check_ads_token, - find_dates_in_reference, find_publication, ingest_publication, - search_ads, ) @@ -62,7 +62,7 @@ def test_ingest_publication(db): @pytest.mark.skipif(check_ads_token() is False, 
reason="ADS_TOKEN not set") def test_search_ads_using_arxix_id(db): - name_add, bibcode_add, doi_add, description = search_ads( + name_add, bibcode_add, doi_add, description = _search_ads( "2023arXiv230812107B", query_type="arxiv", reference=None, @@ -76,7 +76,7 @@ def test_search_ads_using_arxix_id(db): == "UNCOVER: JWST Spectroscopy of Three Cold Brown Dwarfs at Kiloparsec-scale Distances" ) - results = search_ads( + results = _search_ads( "2022arXiv220800211G", query_type="arxiv", reference=None, @@ -87,13 +87,13 @@ def test_search_ads_using_arxix_id(db): @pytest.mark.skipif(check_ads_token() is False, reason="ADS_TOKEN not set") def test_search_ads_using_doi(): - results = search_ads("10.1093/mnras/staa1522", query_type="doi", reference=None) + results = _search_ads("10.1093/mnras/staa1522", query_type="doi", reference=None) assert results[0] == "Belo20" assert results[1] == "2020MNRAS.496.1922B" assert results[2] == "10.1093/mnras/staa1522" assert results[3] == "Unresolved stellar companions with Gaia DR2 astrometry" - results = search_ads( + results = _search_ads( "10.3847/1538-4357/ad206f", query_type="doi", reference="test03" ) assert results[0] == "test03" @@ -107,7 +107,7 @@ def test_search_ads_using_doi(): @pytest.mark.skipif(check_ads_token() is False, reason="ADS_TOKEN not set") def test_search_ads_using_bibcode(): - results = search_ads( + results = _search_ads( "2020MNRAS.496.1922B", query_type="bibcode", reference="Blah98" ) assert results[0] == "Blah98" @@ -117,5 +117,5 @@ def test_search_ads_using_bibcode(): def test_find_dates_in_reference(): - assert find_dates_in_reference("Wright_2010") == "10" - assert find_dates_in_reference("Refr20") == "20" + assert _find_dates_in_reference("Wright_2010") == "10" + assert _find_dates_in_reference("Refr20") == "20" From 95de789f2225d1163d47ed46066f7b0fe0747f96 Mon Sep 17 00:00:00 2001 From: kelle Date: Tue, 8 Apr 2025 17:17:52 -0400 Subject: [PATCH 15/15] fix bug found during review, add test --- 
astrodb_utils/publications.py | 44 ++++++++++++++---------- astrodb_utils/tests/test_publications.py | 3 ++ 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/astrodb_utils/publications.py b/astrodb_utils/publications.py index 4080919..48806c0 100644 --- a/astrodb_utils/publications.py +++ b/astrodb_utils/publications.py @@ -39,6 +39,7 @@ def find_publication( Returns ------- + # TODO: Return three things: Boolean, n_pubs_found, string result True, str: if only one match False, 0: No matches False, N_matches: Multiple matches @@ -173,25 +174,30 @@ def find_publication( results = _search_ads( bibcode, query_type="arxiv" ) - bibcode_alt = results[1] - not_null_pub_filters = [] - not_null_pub_filters.append(db.Publications.c.bibcode.ilike(bibcode_alt)) - print(not_null_pub_filters) - pub_search_table = Table() - pub_search_table = ( - db.query(db.Publications).filter(or_(*not_null_pub_filters)).table() - ) - if len(pub_search_table) == 1: - logger.debug( - f"Found {len(pub_search_table)} matching publications for " - f"{reference} or {doi} or {bibcode}: {pub_search_table['reference'].data}" - ) - if logger.level <= 10: # debug - pub_search_table.pprint_all() + + if results is not None: + bibcode_alt = results[1] + not_null_pub_filters = [] + not_null_pub_filters.append(db.Publications.c.bibcode.ilike(bibcode_alt)) + print(not_null_pub_filters) + pub_search_table = Table() + pub_search_table = ( + db.query(db.Publications).filter(or_(*not_null_pub_filters)).table() + ) + if len(pub_search_table) == 1: + logger.debug( + f"Found {len(pub_search_table)} matching publications for " + f"{reference} or {doi} or {bibcode}: {pub_search_table['reference'].data}" + ) + if logger.level <= 10: # debug + pub_search_table.pprint_all() + + return True, pub_search_table["reference"].data[0] + else: + return False, len(pub_search_table) + else: + return False, 0 # No matches found using arxiv in bibcode - return True, pub_search_table["reference"].data[0] - else: - return 
False, len(pub_search_table) else: return False, n_pubs_found @@ -335,7 +341,7 @@ def check_ads_token(): return use_ads -def _search_ads(value: str, query_type: Literal["arxiv","bibcode","doi"], reference): +def _search_ads(value: str, query_type: Literal["arxiv","bibcode","doi"], reference=None): """ Search ADS for a publication using the provided string and query type. The query type indicates if the string provided is an arXiv ID, bibcode, or DOI. diff --git a/astrodb_utils/tests/test_publications.py b/astrodb_utils/tests/test_publications.py index 4af37b2..493a0e2 100644 --- a/astrodb_utils/tests/test_publications.py +++ b/astrodb_utils/tests/test_publications.py @@ -28,6 +28,9 @@ def test_find_publication(db): assert find_publication(db, reference=None) == (False, 0) + # Test with a non-existent arxiv ID + assert find_publication(db, bibcode="2023arXiv2308121074B") == (False, 0) + # find_publication(db,bibcode="2022arXiv220800211G" )