diff --git a/astrodb_utils/sources.py b/astrodb_utils/sources.py
index b90fe48..8c004c8 100644
--- a/astrodb_utils/sources.py
+++ b/astrodb_utils/sources.py
@@ -11,7 +11,7 @@
 
 __all__ = [
     "find_source_in_db",
-    "ingest_names",
+    "ingest_name",
     "ingest_source",
 ]
 
@@ -28,7 +28,7 @@ def find_source_in_db(
     ra_col_name="ra_deg",
     dec_col_name="dec_deg",
     use_simbad=True,
-    fuzzy=False
+    fuzzy=False,
 ):
     """
     Find a source in the database given a source name and optional coordinates.
@@ -187,7 +187,7 @@ def coords_from_simbad(source):
 
 
 # NAMES
-def ingest_names(
+def ingest_name(
     db, source: str = None, other_name: str = None, raise_error: bool = None
 ):
     """
@@ -206,22 +206,26 @@ def ingest_names(
 
     Returns
     -------
-    None
+    other_name: str
+        Name of the source as it appears in the Names table
+
+        or None if name was not ingested
+
     """
-    names_data = [{"source": source, "other_name": other_name}]
+    source = strip_unicode_dashes(source)
+    other_name = strip_unicode_dashes(other_name)
+    name_data = [{"source": source, "other_name": other_name}]
     try:
         with db.engine.connect() as conn:
-            conn.execute(db.Names.insert().values(names_data))
+            conn.execute(db.Names.insert().values(name_data))
             conn.commit()
-        logger.info(f"Name added to database: {names_data}\n")
+        logger.info(f"Name added to database: {name_data}\n")
+        return other_name
     except sqlalchemy.exc.IntegrityError as e:
-        msg = f"Could not add {names_data} to Names."
-        if "UNIQUE constraint failed:" in str(e):
-            msg += " Other name is already present."
-        if raise_error:
-            raise AstroDBError(msg) from e
-        else:
-            logger.warning(msg)
+        msg = f"Could not add {name_data} to Names."
+        if "UNIQUE constraint failed: " in str(e):
+            msg += " Other name is already present."
+        exit_function(msg, raise_error)
 
 
 # SOURCES
@@ -286,6 +290,9 @@ def ingest_source(
 
     logger.debug(f"Trying to ingest source: {source}")
 
+    # change unicode dash characters to `-`
+    source = strip_unicode_dashes(source)
+
     # Make sure reference is provided and in References table
     ref_check = find_publication(db, reference=reference)
     logger.debug(f"ref_check: {ref_check}")
@@ -323,7 +330,7 @@ def ingest_source(
 
         # One source match in the database, ingesting possible alt name
         if len(name_matches) == 1:
-            ingest_names(db, name_matches[0], source)
+            ingest_name(db, name_matches[0], source)
             msg2 = f" Already in database as {name_matches[0]}. \n "
 
         # Multiple source matches in the database, unable to ingest source
@@ -375,18 +382,17 @@ def ingest_source(
         msg = f"Added {source_data}"
         logger.info(f"Added {source}")
         logger.debug(msg)
-    except sqlalchemy.exc.IntegrityError as e:
+    except sqlalchemy.exc.IntegrityError:
         msg = f"Not ingesting {source}. Not sure why. \n"
         msg2 = f" {source_data} "
         logger.warning(msg)
         logger.debug(msg2)
-        if raise_error:
-            raise AstroDBError(msg + msg2) from e
-        else:
-            return
+        exit_function(msg + msg2, raise_error)
+        # exit_function hands control back when it does not raise
+        return
 
     # Add the source name to the Names table
-    ingest_names(db, source=source, other_name=source, raise_error=raise_error)
+    ingest_name(db, source=source, other_name=source, raise_error=raise_error)
 
     return
 
@@ -462,3 +468,40 @@ def find_survey_name_in_simbad(sources, desig_prefix, source_id_index=None):
     )
 
     return result_table
+
+
+def strip_unicode_dashes(source):
+    """
+    Replace unicode dash characters in a source name with `-`.
+
+    Parameters
+    ----------
+    source: str or None
+        Name to clean up
+
+    Returns
+    -------
+    source: str or None
+        Name with en dash, em dash, minus sign, and figure dash replaced by `-`
+
+        or None if no name was given
+
+    """
+    # ingest_name defaults its names to None; hand that back untouched
+    # rather than failing on the membership test below
+    if source is None:
+        return None
+
+    unicode_list = [
+        ("\u2013", "en dash"),
+        ("\u2014", "em dash"),
+        ("\u2212", "minus sign"),
+        ("\u2012", "figure dash"),
+    ]
+
+    for char, char_name in unicode_list:
+        if char in source:
+            source = source.replace(char, "-")
+            logger.info(f"replaced {char_name}({char}) with - in {source}")
+
+    return source
diff --git a/astrodb_utils/tests/conftest.py b/astrodb_utils/tests/conftest.py
index 0af753e..989fa3d 100644
--- a/astrodb_utils/tests/conftest.py
+++ b/astrodb_utils/tests/conftest.py
@@ -5,7 +5,6 @@
 
 from astrodb_utils import load_astrodb, logger
 from astrodb_utils.publications import ingest_publication
-from astrodb_utils.sources import ingest_source
 
 logger.setLevel("DEBUG")
 
@@ -28,8 +27,6 @@ def db():
 
     logger.info("Loaded AstroDB Template database using load_astrodb function in conftest.py")
 
-
-
     ingest_publication(
         db,
         reference="Refr20",
@@ -40,8 +37,6 @@ def db():
 
     ingest_publication(db, doi="10.1086/161442", reference="Prob83")
 
-    ingest_source(db, "LHS 2924", reference="Prob83")
-
     return db
 
 
diff --git a/astrodb_utils/tests/test_sources.py b/astrodb_utils/tests/test_sources.py
index e8edeb1..e401259 100644
--- a/astrodb_utils/tests/test_sources.py
+++ b/astrodb_utils/tests/test_sources.py
@@ -8,7 +8,9 @@
 from astrodb_utils.sources import (
     coords_from_simbad,
     find_source_in_db,
+    ingest_name,
     ingest_source,
+    strip_unicode_dashes,
 )
 
 
@@ -51,6 +53,14 @@
                 "raise_error": False,
             }
         ),
+        {
+            "source": "LHS 2924",  # needed for test_find_source_in_db
+            "ra": None,
+            "dec": None,
+            "reference": "Prob83",
+            "raise_error": False,
+        }
+
     ],
 )
 @pytest.mark.filterwarnings(
@@ -187,3 +197,29 @@ def test_coords_from_simbad():
     coords = coords_from_simbad("Barnard Star")
     assert math.isclose(coords.ra.deg, 269.452, abs_tol=0.001)
     assert math.isclose(coords.dec.deg, 4.6933, abs_tol=0.001)
+
+
+def test_ingest_name(db):
+    result = ingest_name(db, "TWA 26", "WISE J113951.07-315921.6")
+    assert result == "WISE J113951.07-315921.6"
+
+    # try to ingest names that are already in the database
+    result = ingest_name(db, "Gl 229b", "HD 42581b", raise_error=False)
+    assert result is None
+
+    with pytest.raises(AstroDBError) as error_message:
+        ingest_name(db, "Gl 229b", "HD 42581b", raise_error=True)
+    assert "Other name is already present." in str(error_message.value)
+
+
+@pytest.mark.parametrize('input,expected', [
+    ('CWISE J221706.28–145437.6', 'CWISE J221706.28-145437.6'),  #en dash 2013
+    ('2MASS J20115649—6201127', '2MASS J20115649-6201127'),  # em dash 2014
+    ('1234−5678', '1234-5678'),  # minus sign 2212
+    ('9W34‒aou', '9W34-aou'),  # figure dash 2012
+    ('should-work', 'should-work'),  # no unicode dashes➖➖
+    (None, None),  # None is handed back unchanged
+])
+def test_strip_unicode_dashes(input, expected):
+    result = strip_unicode_dashes(input)
+    assert result == expected