From ab0c9ce6f37e6716aa59017fd2761357905c0c4f Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 13 Apr 2022 09:05:53 -0400 Subject: [PATCH 01/31] bugfix: stopped curated metadata being loaded in maintenance_metadata if curated metadata is Null. --- ADSCitationCapture/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index b8812d7..a59c7c1 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -441,7 +441,7 @@ def task_maintenance_metadata(dois, bibcodes, reset = False): bibcode_replaced = {'previous': registered_record['bibcode'], 'new': parsed_metadata['bibcode'] } #Protect curated metadata from being bulldozed by metadata updates. - if curated_metadata is not {}: + if not curated_metadata: modified_metadata = db.generate_modified_metadata(parsed_metadata, curated_metadata) zenodo_bibstem = "zndo" new_bibcode = doi.build_bibcode(modified_metadata, doi.zenodo_doi_re, zenodo_bibstem) From 7fffd787239c6fffba642bafc5df33d06e838ad8 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 13 Apr 2022 09:23:50 -0400 Subject: [PATCH 02/31] More updates to maintenance metadata. 
--- ADSCitationCapture/tasks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index a59c7c1..2b9c032 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -459,9 +459,9 @@ def task_maintenance_metadata(dois, bibcodes, reset = False): alternate_bibcode.append(registered_record.get('bibcode')) #Add the CC generated bibcode to the parsed metadata parsed_metadata['alternate_bibcode'].append(registered_record.get('bibcode')) - parsed_metadata['alternate_bibcode'] = list(set(parsed_metadata.get('alternate_bibcode'))) - modified_metadata['bibcode'] = new_bibcode - bibcode_replaced = {'previous': registered_record['bibcode'], 'new': parsed_metadata['bibcode'] } + parsed_metadata['alternate_bibcode'] = list(set(parsed_metadata.get('alternate_bibcode'))) + modified_metadata['bibcode'] = new_bibcode + bibcode_replaced = {'previous': registered_record['bibcode'], 'new': parsed_metadata['bibcode'] } alt_bibcodes = list(set(alternate_bibcode)) modified_metadata['alternate_bibcode'] = alt_bibcodes curated_metadata['alternate_bibcode'] = alt_bibcodes From 011850eb6c3af0016dc396be4956b791d568c80f Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 13 Apr 2022 11:06:17 -0400 Subject: [PATCH 03/31] more updates. 
--- ADSCitationCapture/tasks.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index 2b9c032..72b848e 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -394,7 +394,7 @@ def task_maintenance_metadata(dois, bibcodes, reset = False): # Fetch DOI metadata (if HTTP request fails, an exception is raised # and the task will be re-queued (see app.py and adsputils)) - curated_metadata = registered_record.get('curated_metadata', None) + curated_metadata = registered_record.get('curated_metadata', {}) logger.debug("Curated metadata for {} is {}".format(registered_record['content'], registered_record['curated_metadata'])) raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], registered_record['content']) @@ -442,6 +442,7 @@ def task_maintenance_metadata(dois, bibcodes, reset = False): #Protect curated metadata from being bulldozed by metadata updates. if not curated_metadata: + logger.info("Re-applying curated metadata for {}".format(registered_record.get('bibcode'))) modified_metadata = db.generate_modified_metadata(parsed_metadata, curated_metadata) zenodo_bibstem = "zndo" new_bibcode = doi.build_bibcode(modified_metadata, doi.zenodo_doi_re, zenodo_bibstem) @@ -453,7 +454,6 @@ def task_maintenance_metadata(dois, bibcodes, reset = False): bibcode = registered_record.get('bibcode') if new_bibcode != registered_record.get('bibcode'): bibcode = new_bibcode - logger.warn("Parsing the new metadata for citation target '%s' produced a different bibcode: '%s'. 
The former will be moved to the 'alternate_bibcode' list, and the new one will be used as the main one.", registered_record['bibcode'],new_bibcode) if registered_record.get('bibcode') not in alternate_bibcode: #generate complete alt bibcode list including any curated entries alternate_bibcode.append(registered_record.get('bibcode')) @@ -461,10 +461,11 @@ def task_maintenance_metadata(dois, bibcodes, reset = False): parsed_metadata['alternate_bibcode'].append(registered_record.get('bibcode')) parsed_metadata['alternate_bibcode'] = list(set(parsed_metadata.get('alternate_bibcode'))) modified_metadata['bibcode'] = new_bibcode + logger.warn("Parsing the new metadata for citation target '%s' produced a different bibcode: '%s'. The former will be moved to the 'alternate_bibcode' list, and the new one will be used as the main one.", registered_record['bibcode'],new_bibcode) bibcode_replaced = {'previous': registered_record['bibcode'], 'new': parsed_metadata['bibcode'] } - alt_bibcodes = list(set(alternate_bibcode)) - modified_metadata['alternate_bibcode'] = alt_bibcodes - curated_metadata['alternate_bibcode'] = alt_bibcodes + alt_bibcodes = list(set(alternate_bibcode)) + curated_metadata['alternate_bibcode'] = alt_bibcodes + modified_metadata['alternate_bibcode'] = alt_bibcodes else: modified_metadata = parsed_metadata From 89d89cf5b1fbed151f629254054ecfb72bbfa1fd Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 13 Apr 2022 12:20:56 -0400 Subject: [PATCH 04/31] Added more commenting to maintenance task. Updated how alternate bibcodes are handled for maintenance_curation and maintenance_metadata. 
--- ADSCitationCapture/tasks.py | 58 +++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index 72b848e..494d53a 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -441,7 +441,7 @@ def task_maintenance_metadata(dois, bibcodes, reset = False): bibcode_replaced = {'previous': registered_record['bibcode'], 'new': parsed_metadata['bibcode'] } #Protect curated metadata from being bulldozed by metadata updates. - if not curated_metadata: + if curated_metadata: logger.info("Re-applying curated metadata for {}".format(registered_record.get('bibcode'))) modified_metadata = db.generate_modified_metadata(parsed_metadata, curated_metadata) zenodo_bibstem = "zndo" @@ -452,20 +452,28 @@ def task_maintenance_metadata(dois, bibcodes, reset = False): alternate_bibcode = list(set(alternate_bibcode+curated_metadata['alternate_bibcode'])) logger.debug('alternate bibcodes are {}'.format(alternate_bibcode)) bibcode = registered_record.get('bibcode') - if new_bibcode != registered_record.get('bibcode'): + #Checks if the new bibcode is now different from the one generated for parsed metadata + if new_bibcode != parsed_metadata.get('bibcode'): + if concept_doi: + new_bibcode = bibcode[:4]+new_bibcode[4:] bibcode = new_bibcode - if registered_record.get('bibcode') not in alternate_bibcode: + if parsed_metadata.get('bibcode') not in alternate_bibcode: #generate complete alt bibcode list including any curated entries - alternate_bibcode.append(registered_record.get('bibcode')) + alternate_bibcode.append(parsed_metadata.get('bibcode')) #Add the CC generated bibcode to the parsed metadata - parsed_metadata['alternate_bibcode'].append(registered_record.get('bibcode')) + parsed_metadata['alternate_bibcode'].append(parsed_metadata.get('bibcode')) + #Remove duplicate bibcodes parsed_metadata['alternate_bibcode'] = list(set(parsed_metadata.get('alternate_bibcode'))) + 
#set new bibcode modified_metadata['bibcode'] = new_bibcode - logger.warn("Parsing the new metadata for citation target '%s' produced a different bibcode: '%s'. The former will be moved to the 'alternate_bibcode' list, and the new one will be used as the main one.", registered_record['bibcode'],new_bibcode) + logger.warn("Parsing the new metadata for citation target '%s' produced a different bibcode: '%s'. The former will be moved to the 'alternate_bibcode' list, and the new one will be used as the main one.", parsed_metadata['bibcode'], new_bibcode) bibcode_replaced = {'previous': registered_record['bibcode'], 'new': parsed_metadata['bibcode'] } - alt_bibcodes = list(set(alternate_bibcode)) - curated_metadata['alternate_bibcode'] = alt_bibcodes - modified_metadata['alternate_bibcode'] = alt_bibcodes + #remove duplicates from full alt bibcode list + alternate_bibcode = list(set(alternate_bibcode)) + #set curated metadata alt bibcodes + curated_metadata['alternate_bibcode'] = alternate_bibcode + + modified_metadata['alternate_bibcode'] = alternate_bibcode else: modified_metadata = parsed_metadata @@ -530,12 +538,17 @@ def task_maintenance_curation(dois, bibcodes, curated_entries, reset = False): #regenerate bibcode with curated_metadata and append old bibcode to alternate_bibcode zenodo_bibstem = "zndo" + #generates new bibcodes with manual curation data new_bibcode = doi.build_bibcode(modified_metadata, doi.zenodo_doi_re, zenodo_bibstem) + #get the original list of alt bibcodes alternate_bibcode = registered_record.get('alternate_bibcode', []) + #set parsed_metadata alt bibcodes to match original list parsed_metadata['alternate_bibcode'] = registered_record.get('alternate_bibcode', []) + #checks for provided alt bibcodes from manual curation if 'alternate_bibcode' in curated_entry.keys(): alternate_bibcode = list(set(alternate_bibcode+curated_entry['alternate_bibcode'])) logger.debug('alternate bibcodes are {}'.format(alternate_bibcode)) + #checks if bibcode has 
changed due to manual curation metadata if new_bibcode != registered_record.get('bibcode'): logger.warn("Parsing the new metadata for citation target '%s' produced a different bibcode: '%s'. The former will be moved to the 'alternate_bibcode' list, and the new one will be used as the main one.", registered_record['bibcode'],new_bibcode) if registered_record.get('bibcode') not in alternate_bibcode: @@ -543,12 +556,18 @@ def task_maintenance_curation(dois, bibcodes, curated_entries, reset = False): alternate_bibcode.append(registered_record.get('bibcode')) #Add the CC generated bibcode to the parsed metadata parsed_metadata['alternate_bibcode'].append(registered_record.get('bibcode')) - parsed_metadata['alternate_bibcode'] = list(set(parsed_metadata.get('alternate_bibcode'))) - modified_metadata['bibcode'] = new_bibcode - bibcode_replaced = {'previous': registered_record['bibcode'], 'new': parsed_metadata['bibcode'] } - alt_bibcodes = list(set(alternate_bibcode)) - modified_metadata['alternate_bibcode'] = alt_bibcodes - curated_entry['alternate_bibcode'] = alt_bibcodes + #removes duplicates from parsed_metadata alt bibcodes + parsed_metadata['alternate_bibcode'] = list(set(parsed_metadata.get('alternate_bibcode'))) + #sets new bibcode + modified_metadata['bibcode'] = new_bibcode + #removes duplicates from all alt bibcodes including ones provided by manual curation + alternate_bibcode = list(set(alternate_bibcode)) + #updates curated entry alt bibcodes only if a new bibcode is generated due to manual curation + curated_entry['alternate_bibcode'] = alternate_bibcode + #marks bibcode as replaced + bibcode_replaced = {'previous': registered_record['bibcode'], 'new': parsed_metadata['bibcode'] } + #sets modified metadata alt bibcodes to match the full list of alt bibcodes. + modified_metadata['alternate_bibcode'] = alternate_bibcode else: #Repopulate parsed_metadata with expected bibcode information from parsed_cited_metadata. 
@@ -556,15 +575,22 @@ def task_maintenance_curation(dois, bibcodes, curated_entries, reset = False): #regenerate bibcode with parsed_metadata and append old bibcode to alternate_bibcode zenodo_bibstem = "zndo" new_bibcode = doi.build_bibcode(parsed_metadata, doi.zenodo_doi_re, zenodo_bibstem) + #get original alt bibcodes alternate_bibcode = registered_record.get('alternate_bibcode', []) parsed_metadata['alternate_bibcode'] = registered_record.get('alternate_bibcode', []) + #reset bibcode if changed if new_bibcode != registered_record.get('bibcode'): logger.warn("Parsing the new metadata for citation target '%s' produced a different bibcode: '%s'. The former will be moved to the 'alternate_bibcode' list, and the new one will be used as the main one.", registered_record['bibcode'],new_bibcode) + #Add old bibcode to alt bibcodes if registered_record.get('bibcode') not in alternate_bibcode: alternate_bibcode.append(registered_record.get('bibcode')) + #set bibcode replaced if necessary + bibcode_replaced = {'previous': registered_record['bibcode'], 'new': parsed_metadata['bibcode'] } + #set alt bibcodes to full list parsed_metadata['alternate_bibcode'] = list(set(alternate_bibcode)) - bibcode_replaced = {'previous': registered_record['bibcode'], 'new': parsed_metadata['bibcode'] } + #reset modified metadata modified_metadata = parsed_metadata + #clear curated metadata curated_entry = {} different_bibcodes = registered_record['bibcode'] != modified_metadata['bibcode'] From 70d8f681baccbde5a9f6e8e1a055dd217957dde9 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 13 Apr 2022 13:43:44 -0400 Subject: [PATCH 05/31] Added functionality to sanitize citation_change content for DOI targets. 
--- ADSCitationCapture/doi.py | 10 ++++++++++ ADSCitationCapture/tasks.py | 9 ++++++++- requirements.txt | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/ADSCitationCapture/doi.py b/ADSCitationCapture/doi.py index 9ae22a5..fa9ae36 100644 --- a/ADSCitationCapture/doi.py +++ b/ADSCitationCapture/doi.py @@ -21,6 +21,7 @@ dc = DataCiteParser() zenodo_doi_re = re.compile(r"^10.\d{4,9}/zenodo\.([0-9]*)$", re.IGNORECASE) +zenodo_doi_reset = re.compile(r"10.\d{4,9}/zenodo\.([0-9]*)", re.IGNORECASE) upper_case_az_character_re = re.compile("[A-Z]") @@ -176,6 +177,15 @@ def parse_metadata(raw_metadata): """ return _parse_metadata_zenodo_doi(raw_metadata) +def sanitize_zenodo_doi(doi): + """ + Takes the imported citation_change content and tries to sanitize it if it is a zenodo doi. + """ + return _sanitize_zendo_doi(zenodo_doi_reset, doi) + +def _sanitize_zendo_doi(zenodo_doi_reset, doi): + return re.search(zenodo_doi_reset, doi).group(0) + def _parse_metadata_zenodo_doi(raw_metadata): """ It expects metadata in datacite format from a zenodo DOI [string] and returns diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index 494d53a..cf5a2bb 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -49,7 +49,14 @@ def task_process_new_citation(citation_change, force=False): content_type = None is_link_alive = False status = "DISCARDED" - + if citation_change.content_type == adsmsg.CitationChangeContentType.doi \ + and citation_change.content not in ["", None]: + clean_doi = doi.sanitize_zenodo_doi(citation_change.content) + if clean_doi: + citation_change.content = clean_doi + else: + logger.warn("Failed to sanitize DOI for {}".format(citation_change.content)) + # Check if we already have the citation target in the DB metadata = db.get_citation_target_metadata(app, citation_change.content) citation_target_in_db = bool(metadata) # False if dict is empty diff --git a/requirements.txt b/requirements.txt index 
6dc79cc..c8ded45 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ alembic==0.9.3 sqlalchemy-postgres-copy==0.5.0 SQLAlchemy-Continuum==1.3.11 beautifulsoup4==4.9.3 -astropy==4.2.1 +astropy==5.0.2 portalocker==1.7.1 SQLAlchemy-Utils==0.37.8 unidecode==0.04.21 From 317c96d3632b67bff4f58d2b7d30b398f9dd6e7a Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 13 Apr 2022 13:44:02 -0400 Subject: [PATCH 06/31] Added functionality to sanitize citation_change content for DOI targets. --- ADSCitationCapture/tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index cf5a2bb..f156f96 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -51,12 +51,13 @@ def task_process_new_citation(citation_change, force=False): status = "DISCARDED" if citation_change.content_type == adsmsg.CitationChangeContentType.doi \ and citation_change.content not in ["", None]: + #attempts to sanitize the DOI to make it more likely to be valid clean_doi = doi.sanitize_zenodo_doi(citation_change.content) if clean_doi: citation_change.content = clean_doi else: logger.warn("Failed to sanitize DOI for {}".format(citation_change.content)) - + # Check if we already have the citation target in the DB metadata = db.get_citation_target_metadata(app, citation_change.content) citation_target_in_db = bool(metadata) # False if dict is empty From 6182e4a0ae2b7065a575e4d388ce0356fb1eecf3 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 13 Apr 2022 14:38:12 -0400 Subject: [PATCH 07/31] Added sanitization to task_maintenance_reevaluate --- ADSCitationCapture/tasks.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index f156f96..fcc7146 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -54,6 +54,7 @@ def task_process_new_citation(citation_change, force=False): 
#attempts to sanitize the DOI to make it more likely to be valid clean_doi = doi.sanitize_zenodo_doi(citation_change.content) if clean_doi: + logger.info("Replacing citation_change.content: {} with sanitized version: {}".format(citation_change.content, clean_doi)) citation_change.content = clean_doi else: logger.warn("Failed to sanitize DOI for {}".format(citation_change.content)) @@ -796,21 +797,30 @@ def task_maintenance_reevaluate(dois, bibcodes): # Fetch DOI metadata (if HTTP request fails, an exception is raised # and the task will be re-queued (see app.py and adsputils)) if previously_discarded_record['content_type'] == 'DOI': - raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], previously_discarded_record['content']) + #Try and sanitize the DOI before reevaluating + clean_doi = doi.sanitize_zenodo_doi(previously_discarded_record.get('content')) + if clean_doi: + logger.info("Replacing citation_change.content: {} with sanitized version: {}".format(previously_discarded_record.get('content'), clean_doi)) + else: + logger.warn("Failed to sanitize DOI for {}".format(previously_discarded_record.get('content'))) + clean_doi = previously_discarded_record.get('content') + + #Fetch metadata and process + raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], clean_doi) if raw_metadata: parsed_metadata = doi.parse_metadata(raw_metadata) is_software = parsed_metadata.get('doctype', '').lower() == "software" if not is_software: - logger.error("Discarded '%s', it is not 'software'", previously_discarded_record['content']) + logger.error("Discarded '%s', it is not 'software'", clean_doi) elif parsed_metadata.get('bibcode') in (None, ""): - logger.error("The metadata for '%s' could not be parsed correctly and it did not correctly compute a bibcode", previously_discarded_record['content']) + logger.error("The metadata for '%s' could not be parsed correctly and it did not correctly compute a bibcode", clean_doi) else: # 
Create citation target in the DB - updated = db.update_citation_target_metadata(app, previously_discarded_record['content'], raw_metadata, parsed_metadata, status='REGISTERED') + updated = db.update_citation_target_metadata(app, clean_doi, raw_metadata, parsed_metadata, status='REGISTERED') if updated: - db.mark_all_discarded_citations_as_registered(app, previously_discarded_record['content']) + db.mark_all_discarded_citations_as_registered(app, clean_doi) if updated: - citation_change = adsmsg.CitationChange(content=previously_discarded_record['content'], + citation_change = adsmsg.CitationChange(content=clean_doi, content_type=getattr(adsmsg.CitationChangeContentType, previously_discarded_record['content_type'].lower()), status=adsmsg.Status.new, timestamp=datetime.now() From 88c6df4180a6a0c59116347af3e998b40818986d Mon Sep 17 00:00:00 2001 From: tjacovich Date: Thu, 14 Apr 2022 11:46:08 -0400 Subject: [PATCH 08/31] More work on sanitization --- ADSCitationCapture/db.py | 57 +++++++++++++++++- ADSCitationCapture/doi.py | 11 +++- ADSCitationCapture/models.py | 2 +- ADSCitationCapture/tasks.py | 60 +++++++++++++++---- README.md | 20 +++---- .../8e83568feb1d_add_sanitized_value.py | 50 ++++++++++++++++ run.py | 7 +-- 7 files changed, 175 insertions(+), 32 deletions(-) create mode 100644 alembic/versions/8e83568feb1d_add_sanitized_value.py diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index e99061b..21cb524 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -71,9 +71,9 @@ def _update_citation_target_metadata_session(session, content, raw_metadata, par raw_metadata = raw_metadata.decode('utf-8') except UnicodeEncodeError: pass - if citation_target.raw_cited_metadata != raw_metadata or citation_target.parsed_cited_metadata != parsed_metadata or \ - (status is not None and citation_target.status != status) or citation_target.curated_metadata != curated_metadata or \ - citation_target.bibcode != bibcode: + + if status == 
'SANITIZED': + #reset status but otherwise leave the citation target alone citation_target.raw_cited_metadata = raw_metadata citation_target.parsed_cited_metadata = parsed_metadata citation_target.curated_metadata = curated_metadata @@ -86,6 +86,22 @@ def _update_citation_target_metadata_session(session, content, raw_metadata, par metadata_updated = True return metadata_updated + else: + if citation_target.raw_cited_metadata != raw_metadata or citation_target.parsed_cited_metadata != parsed_metadata or \ + (status is not None and citation_target.status != status) or citation_target.curated_metadata != curated_metadata or \ + citation_target.bibcode != bibcode: + citation_target.raw_cited_metadata = raw_metadata + citation_target.parsed_cited_metadata = parsed_metadata + citation_target.curated_metadata = curated_metadata + citation_target.bibcode = bibcode + if status is not None: + citation_target.status = status + session.add(citation_target) + session.commit() + logger.info("Updated metadata for citation target '%s' (alternative bibcodes '%s')", content, ", ".join(parsed_metadata.get('alternate_bibcode', []))) + metadata_updated = True + return metadata_updated + def update_citation_target_metadata(app, content, raw_metadata, parsed_metadata, curated_metadata = {}, status=None, bibcode = None): """ Update metadata for a citation target @@ -285,6 +301,19 @@ def get_citations(app, citation_change): citation_bibcodes = [r.citing for r in session.query(Citation).filter_by(content=citation_change.content, status="REGISTERED").all()] return citation_bibcodes +def get_citation_data(app, citing_bibcode, content): + with app.session_scope() as session: + citation_change = session.query(Citation).filter_by(content=content, status="REGISTERED", citing=citing_bibcode).first() + if citation_change: + citation = Citation() + citation.citing = citation_change.citing + citation.cited = citation_change.cited + citation.content = citation_change.content + citation.resolved = 
citation_change.resolved + citation.timestamp = citation_change.timestamp.ToDatetime().replace(tzinfo=tzutc()) + citation.status = citation_change.status + return citation + def generate_modified_metadata(parsed_metadata, curated_entry): """ modify parsed_metadata with any curated metadata. return results. @@ -339,6 +368,28 @@ def update_citation(app, citation_change): logger.info("Ignoring citation update (citing '%s', content '%s' and timestamp '%s') because received timestamp is equal/older than timestamp in database", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString()) return updated +def update_citation_content(app, citation_change, old_content): + """ + Update citation record information + """ + updated = False + with app.session_scope() as session: + citation = session.query(Citation).with_for_update().filter_by(citing=citation_change.citing, content=old_content).first() + change_timestamp = citation_change.timestamp.ToDatetime().replace(tzinfo=tzutc()) # Consider it as UTC to be able to compare it + if citation.timestamp < change_timestamp: + #citation.citing = citation_change.citing # This should not change + citation.content = citation_change.content # This should not change except in a very specific circumstance related to sanitizing dois + citation.cited = citation_change.cited + citation.resolved = citation_change.resolved + citation.timestamp = change_timestamp + session.add(citation) + session.commit() + updated = True + logger.info("Updated citation (citing '%s', content '%s' and timestamp '%s')", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString()) + else: + logger.info("Ignoring citation update (citing '%s', content '%s' and timestamp '%s') because received timestamp is equal/older than timestamp in database", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString()) + return updated + def mark_citation_as_deleted(app, citation_change): 
""" Update status to DELETED for a given citation diff --git a/ADSCitationCapture/doi.py b/ADSCitationCapture/doi.py index fa9ae36..e95f99e 100644 --- a/ADSCitationCapture/doi.py +++ b/ADSCitationCapture/doi.py @@ -182,9 +182,16 @@ def sanitize_zenodo_doi(doi): Takes the imported citation_change content and tries to sanitize it if it is a zenodo doi. """ return _sanitize_zendo_doi(zenodo_doi_reset, doi) - + def _sanitize_zendo_doi(zenodo_doi_reset, doi): - return re.search(zenodo_doi_reset, doi).group(0) + doi_root = '10.5281' + try: + #splits apart any conjoined dois and takes the first full one. + spl_doi = doi_root + doi.split(doi_root)[1] + return re.search(zenodo_doi_reset, spl_doi).group(0) + except: + logger.error("Unable to parse content: {}".format(doi)) + return None def _parse_metadata_zenodo_doi(raw_metadata): """ diff --git a/ADSCitationCapture/models.py b/ADSCitationCapture/models.py index 413ec15..e763f5e 100644 --- a/ADSCitationCapture/models.py +++ b/ADSCitationCapture/models.py @@ -14,7 +14,7 @@ citation_content_type = ENUM('DOI', 'PID', 'URL', name='citation_content_type') citation_change_type = ENUM('NEW', 'DELETED', 'UPDATED', name='citation_change_type') citation_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', name='citation_status_type') -target_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', name='target_status_type') +target_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', 'SANITIZED', name='target_status_type') class RawCitation(Base): diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index fcc7146..8609afa 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -49,15 +49,15 @@ def task_process_new_citation(citation_change, force=False): content_type = None is_link_alive = False status = "DISCARDED" - if citation_change.content_type == adsmsg.CitationChangeContentType.doi \ - and citation_change.content not in ["", None]: - #attempts 
to sanitize the DOI to make it more likely to be valid - clean_doi = doi.sanitize_zenodo_doi(citation_change.content) - if clean_doi: - logger.info("Replacing citation_change.content: {} with sanitized version: {}".format(citation_change.content, clean_doi)) - citation_change.content = clean_doi - else: - logger.warn("Failed to sanitize DOI for {}".format(citation_change.content)) + # if citation_change.content_type == adsmsg.CitationChangeContentType.doi \ + # and citation_change.content not in ["", None]: + # #attempts to sanitize the DOI to make it more likely to be valid + # clean_doi = doi.sanitize_zenodo_doi(citation_change.content) + # if clean_doi: + # logger.info("Replacing citation_change.content: {} with sanitized version: {}".format(citation_change.content, clean_doi)) + # citation_change.content = clean_doi + # else: + # logger.warn("Failed to sanitize DOI for {}".format(citation_change.content)) # Check if we already have the citation target in the DB metadata = db.get_citation_target_metadata(app, citation_change.content) @@ -816,9 +816,44 @@ def task_maintenance_reevaluate(dois, bibcodes): logger.error("The metadata for '%s' could not be parsed correctly and it did not correctly compute a bibcode", clean_doi) else: # Create citation target in the DB - updated = db.update_citation_target_metadata(app, clean_doi, raw_metadata, parsed_metadata, status='REGISTERED') - if updated: - db.mark_all_discarded_citations_as_registered(app, clean_doi) + #If the DOI has been sanitized, CC needs to update the content of each citation + if clean_doi != previously_discarded_record.get('content'): + #check to make sure clean doi doesn't already exist + metadata = db.get_citation_target_metadata(app, clean_doi) + citation_target_in_db = bool(metadata) + citation_change = adsmsg.CitationChange(content=clean_doi, + content_type=getattr(adsmsg.CitationChangeContentType, previously_discarded_record['content_type'].lower()), + status=adsmsg.Status.new, + 
timestamp=datetime.now() + ) + #Check if sanitized record is already in db + if citation_target_in_db: + logger.warn("Sanitized doi: {} already exists in db. Pointing citations to new target.".format(clean_doi)) + stored = True + updated = db.update_citation_target_metadata(app, previously_discarded_record['content_type'], raw_metadata, parsed_metadata, status='SANITIZED') + #Add citation target to database. Update old citation to SANITIZED + else: + stored = db.store_citation_target(app, citation_change, previously_discarded_record['content_type'], raw_metadata, parsed_metadata, status='REGISTERED') + updated = db.update_citation_target_metadata(app, previously_discarded_record['content_type'], raw_metadata, parsed_metadata, status='SANITIZED') + #If stored, go through and find all citations to the old doi and point them to the new record. + if stored: + #Mark all citations to original target as 'REGISTERED' + db.mark_all_discarded_citations_as_registered(app, previously_discarded_record.get('content')) + #Update the content of each citation connected to previously discarded record. + original_citations = db.get_citations(app, citation_change) + for cite in original_citations: + logger.debug("Updating content to {} for citing bibcode: {}".format(clean_doi, cite)) + #Fetch full citation object for each citation + citation_data = db.get_citation_data(app, cite, citation_change.content) + #replace content + citation_data.content = clean_doi + #update citation + db.update_citation_content(app, citation_data, previously_discarded_record.get('content')) + #Update the citation target if the content hasn't changed. + else: + updated = db.update_citation_target_metadata(app, clean_doi, raw_metadata, parsed_metadata, status='REGISTERED') + + #If there are updates to records, send the updates to the master TODO: Should we emit events as well? 
if updated: citation_change = adsmsg.CitationChange(content=clean_doi, content_type=getattr(adsmsg.CitationChangeContentType, previously_discarded_record['content_type'].lower()), @@ -830,6 +865,7 @@ def task_maintenance_reevaluate(dois, bibcodes): original_citations = db.get_citations_by_bibcode(app, parsed_metadata['bibcode']) citations = api.get_canonical_bibcodes(app, original_citations) logger.debug("Calling 'task_output_results' with '%s'", citation_change) + #_emit_citation_change(citation_change, parsed_metadata) task_output_results.delay(citation_change, parsed_metadata, citations, bibcode_replaced=bibcode_replaced) @app.task(queue='output-results') diff --git a/README.md b/README.md index 5b9f7ec..4cbc79a 100644 --- a/README.md +++ b/README.md @@ -513,18 +513,18 @@ Currently only sends newly registered records to Master. ``` -# Curating based on an input file. -python3 run.py MAINTENANCE --curation --input_filename $path/to/input_file -# Curating based on JSON from a command line argument by bibcode. -python3 run.py MAINTENANCE --curation --bibcode "YYYYzndo...BCDEFGR" --json {'curated_metadata'} -# Curating based on JSON from a command line argument by DOI. -python3 run.py MAINTENANCE --curation --doi "10.XYZA/ZENODO.BCDEFG" --json {'curated_metadata'} -# Clear curated_metadata for a given entry by bibcode -python3 run.py MAINTENANCE --curation --bibcode "YYYYzndo...BCDEFGR" --reset -# Clear curated_metadata for a given entry by doi + # Curating based on an input file. + python3 run.py MAINTENANCE --curation --input_filename $path/to/input_file + # Curating based on JSON from a command line argument by bibcode. +python3 run.py MAINTENANCE --curation --bibcode "2021zndo...5659382R" --json '{"abstract": "Analysis software for COS observations of PG quasars from QUEST sample: Veilleux et al. 2022, ApJ, 926, 60."}' + # Curating based on JSON from a command line argument by DOI. 
+ python3 run.py MAINTENANCE --curation --doi "10.5281/zenodo.5659382" --json '{"abstract": "Analysis software for COS observations of PG quasars from QUEST sample: Veilleux et al. 2022, ApJ, 926, 60."}' + # Clear curated_metadata for a given entry by bibcode + python3 run.py MAINTENANCE --curation --bibcode "YYYYzndo...BCDEFGR" --reset + # Clear curated_metadata for a given entry by doi python3 run.py MAINTENANCE --curation --doi "10.XYZA/ZENODO.BCDEFG" --reset # Clear curated_metadata by file -python3 run.py MAINTENANCE --curation --input_filename $/path/to/input_file --reset +python3 run.py MAINTENANCE --curation --input_filename /path/to/input_file --reset # Display current metadata for a given entry by doi as standard output python3 run.py MAINTENANCE --curation --doi "10.XYZA/ZENODO.BCDEFG" --show # Display current metadata for a given entry by bibcode as standard output diff --git a/alembic/versions/8e83568feb1d_add_sanitized_value.py b/alembic/versions/8e83568feb1d_add_sanitized_value.py new file mode 100644 index 0000000..b9656fb --- /dev/null +++ b/alembic/versions/8e83568feb1d_add_sanitized_value.py @@ -0,0 +1,50 @@ +"""add_sanitized_value + +Revision ID: 8e83568feb1d +Revises: 7021071e5e63 +Create Date: 2022-04-14 09:24:42.277371 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision = '8e83568feb1d' +down_revision = '7021071e5e63' +branch_labels = None +depends_on = None + + +def upgrade(): + op.execute('COMMIT') + op.execute("ALTER TYPE target_status_type ADD VALUE 'SANITIZED'") + op.execute("ALTER TYPE citation_status_type ADD VALUE 'SANITIZED'") + + +def downgrade(): +#Move expanded status types to old + op.execute("ALTER TYPE target_status_type RENAME TO target_status_type_old") + op.execute("CREATE TYPE target_status_type AS ENUM('REGISTERED', 'DELETED', 'UPDATED', 'EMITTABLE')") + + #instantiate original status types + op.execute("ALTER TYPE citation_status_type RENAME TO citation_status_type_old") + op.execute("CREATE TYPE citation_status_type AS ENUM('REGISTERED', 'DELETED', 'UPDATED', 'EMITTABLE')") + + #DROP expanded status columns + op.drop_column('citation_target_version','status') + op.drop_column('citation_target','status') + op.drop_column('citation_version','status') + op.drop_column('citation','status') + + #DROP old ENUM types + op.execute("DROP TYPE target_status_type_old") + op.execute("DROP TYPE citation_status_type_old") + + #ADD original status columns + op.add_column('citation_target',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', 'EMITTABLE', name='target_status_type'), nullable=True)) + op.add_column('citation_target_version',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', 'EMITTABLE', name='target_status_type'), nullable=True)) + op.add_column('citation_version',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', 'EMITTABLE', name='citation_status_type'), nullable=True)) + op.add_column('citation',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', 'EMITTABLE', name='citation_status_type'), nullable=True)) + + diff --git a/run.py b/run.py index d6b13b5..d1805d7 100755 --- a/run.py +++ b/run.py @@ -174,12 +174,12 @@ def maintenance_curation(filename = None, dois = None, bibcodes = None, json_pay raise 
ValueError(msg) try: #convert json line to list of dicts, 1 dict per entry. - curated_entries = [json.loads(json_payload)] + curated_entries = [json.loads(p) for p in json_payload] if dois: - for ele, doi in enumerate(ele, dois): + for ele, doi in enumerate(dois): curated_entries[ele]['doi'] = doi elif bibcodes: - for ele, bibcode in enumerate(ele, bibcodes): + for ele, bibcode in enumerate(bibcodes): curated_entries[ele]['bibcode'] = bibcode except Exception as e: msg = "Parsing json arg: {}, failed with Exception: {}. Please check each entry is properly formatted.".format(json_payload, e) @@ -302,7 +302,6 @@ def _build_diagnostics(bibcodes=None, json_payloads=None): maintenance_parser.add_argument( '--json', dest='json_payload', - nargs='+', action='store', default=None, help='Space delimited list of json curated metadata') From 9b858c1fc2422885ee373057a7ddcc3630c8a923 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Thu, 14 Apr 2022 13:07:45 -0400 Subject: [PATCH 09/31] working on modifying maintenance reevaluate to update citation content for sanitized records. 
--- ADSCitationCapture/db.py | 32 ++++++++++++++++++-------------- ADSCitationCapture/tasks.py | 10 ++++++---- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index 21cb524..cce61ca 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -77,7 +77,7 @@ def _update_citation_target_metadata_session(session, content, raw_metadata, par citation_target.raw_cited_metadata = raw_metadata citation_target.parsed_cited_metadata = parsed_metadata citation_target.curated_metadata = curated_metadata - citation_target.bibcode = bibcode + citation_target.bibcode = None if status is not None: citation_target.status = status session.add(citation_target) @@ -310,7 +310,7 @@ def get_citation_data(app, citing_bibcode, content): citation.cited = citation_change.cited citation.content = citation_change.content citation.resolved = citation_change.resolved - citation.timestamp = citation_change.timestamp.ToDatetime().replace(tzinfo=tzutc()) + citation.timestamp = citation_change.timestamp citation.status = citation_change.status return citation @@ -375,19 +375,23 @@ def update_citation_content(app, citation_change, old_content): updated = False with app.session_scope() as session: citation = session.query(Citation).with_for_update().filter_by(citing=citation_change.citing, content=old_content).first() - change_timestamp = citation_change.timestamp.ToDatetime().replace(tzinfo=tzutc()) # Consider it as UTC to be able to compare it - if citation.timestamp < change_timestamp: - #citation.citing = citation_change.citing # This should not change - citation.content = citation_change.content # This should not change except in a very specific circumstance related to sanitizing dois - citation.cited = citation_change.cited - citation.resolved = citation_change.resolved - citation.timestamp = change_timestamp - session.add(citation) - session.commit() - updated = True - logger.info("Updated citation (citing '%s', 
content '%s' and timestamp '%s')", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString()) + #change_timestamp = citation_change.timestamp.ToDatetime().replace(tzinfo=tzutc()) # Consider it as UTC to be able to compare it + if citation: + if citation.timestamp < citation_change.timestamp: + #citation.citing = citation_change.citing # This should not change + citation.content = citation_change.content # This should not change except in a very specific circumstance related to sanitizing dois + citation.cited = citation_change.cited + citation.resolved = citation_change.resolved + citation.timestamp = citation_change.timestamp + session.add(citation) + session.commit() + updated = True + logger.info("Updated citation (citing '%s', content '%s' and timestamp '%s')", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString()) + else: + logger.info("Ignoring citation update (citing '%s', content '%s' and timestamp '%s') because received timestamp is equal/older than timestamp in database", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString()) else: - logger.info("Ignoring citation update (citing '%s', content '%s' and timestamp '%s') because received timestamp is equal/older than timestamp in database", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString()) + logger.info("Unable to update citation (citing '%s', content '%s' and timestamp '%s')", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString()) + return updated def mark_citation_as_deleted(app, citation_change): diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index 8609afa..3b9c24d 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -799,8 +799,9 @@ def task_maintenance_reevaluate(dois, bibcodes): if previously_discarded_record['content_type'] == 'DOI': #Try and sanitize the DOI before 
reevaluating clean_doi = doi.sanitize_zenodo_doi(previously_discarded_record.get('content')) - if clean_doi: - logger.info("Replacing citation_change.content: {} with sanitized version: {}".format(previously_discarded_record.get('content'), clean_doi)) + if clean_doi: + if clean_doi != previously_discarded_record.get('content'): + logger.info("Replacing citation_change.content: {} with sanitized version: {}".format(previously_discarded_record.get('content'), clean_doi)) else: logger.warn("Failed to sanitize DOI for {}".format(previously_discarded_record.get('content'))) clean_doi = previously_discarded_record.get('content') @@ -830,11 +831,11 @@ def task_maintenance_reevaluate(dois, bibcodes): if citation_target_in_db: logger.warn("Sanitized doi: {} already exists in db. Pointing citations to new target.".format(clean_doi)) stored = True - updated = db.update_citation_target_metadata(app, previously_discarded_record['content_type'], raw_metadata, parsed_metadata, status='SANITIZED') + updated = db.update_citation_target_metadata(app, previously_discarded_record['content'], raw_metadata, parsed_metadata, status='SANITIZED') #Add citation target to database. Update old citation to SANITIZED else: stored = db.store_citation_target(app, citation_change, previously_discarded_record['content_type'], raw_metadata, parsed_metadata, status='REGISTERED') - updated = db.update_citation_target_metadata(app, previously_discarded_record['content_type'], raw_metadata, parsed_metadata, status='SANITIZED') + updated = db.update_citation_target_metadata(app, previously_discarded_record['content'], raw_metadata, parsed_metadata, status='SANITIZED') #If stored, go through and find all citations to the old doi and point them to the new record. 
if stored: #Mark all citations to original target as 'REGISTERED' @@ -845,6 +846,7 @@ def task_maintenance_reevaluate(dois, bibcodes): logger.debug("Updating content to {} for citing bibcode: {}".format(clean_doi, cite)) #Fetch full citation object for each citation citation_data = db.get_citation_data(app, cite, citation_change.content) + citation_data.timestamp = datetime.now() #replace content citation_data.content = clean_doi #update citation From 7d7412d9372e71201797a3d9390f8fc4124204be Mon Sep 17 00:00:00 2001 From: tjacovich Date: Fri, 15 Apr 2022 12:11:53 -0400 Subject: [PATCH 10/31] Changed behavior to mark citations to sanitized records as sanitized and then create new citations to the sanitized records. --- ADSCitationCapture/db.py | 53 ++++++++++++++++++++++++++++++++---- ADSCitationCapture/models.py | 2 +- ADSCitationCapture/tasks.py | 22 +++++++++++---- 3 files changed, 64 insertions(+), 13 deletions(-) diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index cce61ca..820ce30 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -2,7 +2,7 @@ from psycopg2 import IntegrityError from dateutil.tz import tzutc from ADSCitationCapture.models import Citation, CitationTarget, Event -from adsmsg import CitationChange +from adsmsg import CitationChange, CitationChangeContentType from adsputils import setup_logging # ============================= INITIALIZATION ==================================== # @@ -17,7 +17,6 @@ level=config.get('LOGGING_LEVEL', 'INFO'), attach_stdout=config.get('LOG_STDOUT', False)) - # =============================== FUNCTIONS ======================================= # def store_event(app, data): """ @@ -292,18 +291,21 @@ def get_citations_by_bibcode(app, bibcode): citations = get_citations(app, dummy_citation_change) return citations -def get_citations(app, citation_change): +def get_citations(app, citation_change, status='REGISTERED'): """ Return all the citations (bibcodes) to a given content. 
- It will ignore DELETED and DISCARDED citations. + It will ignore DELETED and DISCARDED citations by default. """ with app.session_scope() as session: - citation_bibcodes = [r.citing for r in session.query(Citation).filter_by(content=citation_change.content, status="REGISTERED").all()] + citation_bibcodes = [r.citing for r in session.query(Citation).filter_by(content=citation_change.content, status=status).all()] return citation_bibcodes def get_citation_data(app, citing_bibcode, content): + """ + Get the data for given citation + """ with app.session_scope() as session: - citation_change = session.query(Citation).filter_by(content=content, status="REGISTERED", citing=citing_bibcode).first() + citation_change = session.query(Citation).filter_by(content=content, citing=citing_bibcode).first() if citation_change: citation = Citation() citation.citing = citation_change.citing @@ -368,6 +370,20 @@ def update_citation(app, citation_change): logger.info("Ignoring citation update (citing '%s', content '%s' and timestamp '%s') because received timestamp is equal/older than timestamp in database", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString()) return updated +def citation_data_to_citation_change(citation_data, previously_discarded_record): + """ + Takes data from a citation and converts it into a citation_change. 
+ """ + citation_change = CitationChange() + citation_change.content_type = getattr(CitationChangeContentType, previously_discarded_record['content_type'].lower()) + citation_change.content = citation_data.content + citation_change.citing = citation_data.citing + citation_change.cited = citation_data.cited + citation_change.resolved = citation_data.resolved + citation_change.timestamp.FromDatetime(citation_data.timestamp) + + return citation_change + def update_citation_content(app, citation_change, old_content): """ Update citation record information @@ -428,6 +444,31 @@ def mark_all_discarded_citations_as_registered(app, content): session.add(citation) session.commit() +def mark_citation_as_sanitized(app, citing, content): + """ + Update status to SANITIZED for all discarded citations of a given content + """ + marked_as_registered = False + previous_status = None + with app.session_scope() as session: + citation = session.query(Citation).with_for_update().filter_by(status='DISCARDED', citing=citing, content=content).first() + citation.status = 'SANITIZED' + session.add(citation) + session.commit() + +def mark_all_citations_as_sanitized(app, content): + """ + Update status to SANITIZED for all discarded citations of a given content + """ + marked_as_registered = False + previous_status = None + with app.session_scope() as session: + citations = session.query(Citation).with_for_update().filter_by(status='DISCARDED', content=content).all() + for citation in citations: + citation.status = 'SANITIZED' + session.add(citation) + session.commit() + def populate_bibcode_column(main_session, curated = True): """ Pulls all citation targets from DB and populates the bibcode column using parsed metadata diff --git a/ADSCitationCapture/models.py b/ADSCitationCapture/models.py index e763f5e..512be13 100644 --- a/ADSCitationCapture/models.py +++ b/ADSCitationCapture/models.py @@ -13,7 +13,7 @@ citation_content_type = ENUM('DOI', 'PID', 'URL', name='citation_content_type') 
citation_change_type = ENUM('NEW', 'DELETED', 'UPDATED', name='citation_change_type') -citation_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', name='citation_status_type') +citation_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', 'SANITIZED', name='citation_status_type') target_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', 'SANITIZED', name='target_status_type') diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index 3b9c24d..d785de6 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -832,16 +832,18 @@ def task_maintenance_reevaluate(dois, bibcodes): logger.warn("Sanitized doi: {} already exists in db. Pointing citations to new target.".format(clean_doi)) stored = True updated = db.update_citation_target_metadata(app, previously_discarded_record['content'], raw_metadata, parsed_metadata, status='SANITIZED') + #Add citation target to database. Update old citation to SANITIZED else: stored = db.store_citation_target(app, citation_change, previously_discarded_record['content_type'], raw_metadata, parsed_metadata, status='REGISTERED') updated = db.update_citation_target_metadata(app, previously_discarded_record['content'], raw_metadata, parsed_metadata, status='SANITIZED') + logger.debug("Stored is : {} for citation target {}".format(stored, previously_discarded_record['content'])) #If stored, go through and find all citations to the old doi and point them to the new record. if stored: - #Mark all citations to original target as 'REGISTERED' - db.mark_all_discarded_citations_as_registered(app, previously_discarded_record.get('content')) - #Update the content of each citation connected to previously discarded record. 
- original_citations = db.get_citations(app, citation_change) + #Get all DISCARDED citations + citation_change.content = previously_discarded_record.get('content', '') + original_citations = db.get_citations(app, citation_change, status = 'DISCARDED') + logger.debug("Original citations: {}".format(original_citations)) for cite in original_citations: logger.debug("Updating content to {} for citing bibcode: {}".format(clean_doi, cite)) #Fetch full citation object for each citation @@ -849,8 +851,16 @@ def task_maintenance_reevaluate(dois, bibcodes): citation_data.timestamp = datetime.now() #replace content citation_data.content = clean_doi - #update citation - db.update_citation_content(app, citation_data, previously_discarded_record.get('content')) + #store new citation + new_citation_change = db.citation_data_to_citation_change(citation_data, previously_discarded_record) + try: + db.store_citation(app, new_citation_change, new_citation_change.content_type, raw_metadata, parsed_metadata, status = 'REGISTERED') + db.mark_citation_as_sanitized(app, cite, previously_discarded_record['content']) + except Exception as e: + logger.error("Failed to update citation from {} to {} with error {}. Skipping.".format(cite, clean_doi, e)) + + #mark old citation as SANITIZED + #Update the citation target if the content hasn't changed. else: updated = db.update_citation_target_metadata(app, clean_doi, raw_metadata, parsed_metadata, status='REGISTERED') From fdbcfb1e9f353ea8f788051cb70be983711beaf5 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Fri, 15 Apr 2022 12:21:23 -0400 Subject: [PATCH 11/31] Added db function to revert citations from sanitized to discarded. 
--- ADSCitationCapture/db.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index 820ce30..346dcaa 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -111,7 +111,6 @@ def update_citation_target_metadata(app, content, raw_metadata, parsed_metadata, metadata_updated = _update_citation_target_metadata_session(session, content, raw_metadata, parsed_metadata, curated_metadata, status, bibcode) return metadata_updated - def store_citation(app, citation_change, content_type, raw_metadata, parsed_metadata, status): """ Stores a new citation in the DB @@ -456,7 +455,7 @@ def mark_citation_as_sanitized(app, citing, content): session.add(citation) session.commit() -def mark_all_citations_as_sanitized(app, content): +def mark_all_discarded_citations_as_sanitized(app, content): """ Update status to SANITIZED for all discarded citations of a given content """ @@ -469,6 +468,19 @@ def mark_all_citations_as_sanitized(app, content): session.add(citation) session.commit() +def mark_all_sanitized_citations_as_discarded(app, content): + """ + Update status to DISCARDED for all sanitized citations of a given content + """ + marked_as_registered = False + previous_status = None + with app.session_scope() as session: + citations = session.query(Citation).with_for_update().filter_by(status='SANITIZED', content=content).all() + for citation in citations: + citation.status = 'SANITIZED' + session.add(citation) + session.commit() + def populate_bibcode_column(main_session, curated = True): """ Pulls all citation targets from DB and populates the bibcode column using parsed metadata From d50dbd724335bde7774ecb84a0c31b8da6d17922 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Fri, 15 Apr 2022 13:17:35 -0400 Subject: [PATCH 12/31] Added sanitization at process_new_citation back to codebase. Added second regex to catch malformed records of the form zenodo/123456. 
--- ADSCitationCapture/db.py | 9 ++------- ADSCitationCapture/doi.py | 19 +++++++++++++------ ADSCitationCapture/tasks.py | 18 +++++++++--------- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index 346dcaa..80a94c7 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -73,12 +73,7 @@ def _update_citation_target_metadata_session(session, content, raw_metadata, par if status == 'SANITIZED': #reset status but otherwise leave the citation target alone - citation_target.raw_cited_metadata = raw_metadata - citation_target.parsed_cited_metadata = parsed_metadata - citation_target.curated_metadata = curated_metadata - citation_target.bibcode = None - if status is not None: - citation_target.status = status + citation_target.status = status session.add(citation_target) session.commit() logger.info("Updated metadata for citation target '%s' (alternative bibcodes '%s')", content, ", ".join(parsed_metadata.get('alternate_bibcode', []))) @@ -445,7 +440,7 @@ def mark_all_discarded_citations_as_registered(app, content): def mark_citation_as_sanitized(app, citing, content): """ - Update status to SANITIZED for all discarded citations of a given content + Update status to SANITIZED for a single discarded citation """ marked_as_registered = False previous_status = None diff --git a/ADSCitationCapture/doi.py b/ADSCitationCapture/doi.py index e95f99e..a30ebd7 100644 --- a/ADSCitationCapture/doi.py +++ b/ADSCitationCapture/doi.py @@ -21,7 +21,6 @@ dc = DataCiteParser() zenodo_doi_re = re.compile(r"^10.\d{4,9}/zenodo\.([0-9]*)$", re.IGNORECASE) -zenodo_doi_reset = re.compile(r"10.\d{4,9}/zenodo\.([0-9]*)", re.IGNORECASE) upper_case_az_character_re = re.compile("[A-Z]") @@ -181,17 +180,25 @@ def sanitize_zenodo_doi(doi): """ Takes the imported citation_change content and tries to sanitize it if it is a zenodo doi. 
""" - return _sanitize_zendo_doi(zenodo_doi_reset, doi) + return _sanitize_zendo_doi(doi) -def _sanitize_zendo_doi(zenodo_doi_reset, doi): +def _sanitize_zendo_doi(doi): doi_root = '10.5281' + zenodo_doi_reset = re.compile(r"10.\d{4,9}/zenodo\.([0-9]*)", re.IGNORECASE) + zenodo_doi_reset_slash = re.compile(r"10.\d{4,9}/zenodo/([0-9]*)", re.IGNORECASE) try: #splits apart any conjoined dois and takes the first full one. spl_doi = doi_root + doi.split(doi_root)[1] return re.search(zenodo_doi_reset, spl_doi).group(0) - except: - logger.error("Unable to parse content: {}".format(doi)) - return None + except Exception as e: + logger.error("Attempt to parse content: {} failed with error: {}. Trying again with alternate regex.".format(doi, e)) + try: + spl_doi = doi_root + doi.split(doi_root)[1] + split = re.search(zenodo_doi_reset_slash, spl_doi).group(0).split('/') + return doi_root + "/" + "zenodo." + split[2] + except Exception as e: + logger.error("Attempt to parse content: {} failed with error: {}.".format(doi, e)) + return None def _parse_metadata_zenodo_doi(raw_metadata): """ diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index d785de6..5b883c3 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -49,15 +49,15 @@ def task_process_new_citation(citation_change, force=False): content_type = None is_link_alive = False status = "DISCARDED" - # if citation_change.content_type == adsmsg.CitationChangeContentType.doi \ - # and citation_change.content not in ["", None]: - # #attempts to sanitize the DOI to make it more likely to be valid - # clean_doi = doi.sanitize_zenodo_doi(citation_change.content) - # if clean_doi: - # logger.info("Replacing citation_change.content: {} with sanitized version: {}".format(citation_change.content, clean_doi)) - # citation_change.content = clean_doi - # else: - # logger.warn("Failed to sanitize DOI for {}".format(citation_change.content)) + if citation_change.content_type == 
adsmsg.CitationChangeContentType.doi \ + and citation_change.content not in ["", None]: + #attempts to sanitize the DOI to make it more likely to be valid + clean_doi = doi.sanitize_zenodo_doi(citation_change.content) + if clean_doi and clean_doi != citation_change.content: + logger.info("Replacing citation_change.content: {} with sanitized version: {}".format(citation_change.content, clean_doi)) + citation_change.content = clean_doi + elif not clean_doi: + logger.warn("Failed to sanitize DOI for {}".format(citation_change.content)) # Check if we already have the citation target in the DB metadata = db.get_citation_target_metadata(app, citation_change.content) From c276bc063cea682a56a5ef9b0ee8599e1ee84cfa Mon Sep 17 00:00:00 2001 From: tjacovich Date: Mon, 25 Apr 2022 16:12:58 -0400 Subject: [PATCH 13/31] Updated task_maintenance_curation to preserve original bibcode entry year. --- ADSCitationCapture/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index 83b4ae8..7ce6b45 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -571,6 +571,7 @@ def task_maintenance_curation(dois, bibcodes, curated_entries, reset = False): zenodo_bibstem = "zndo" #generates new bibcodes with manual curation data new_bibcode = doi.build_bibcode(modified_metadata, doi.zenodo_doi_re, zenodo_bibstem) + new_bibcode = registered_record['bibcode'][:4] + new_bibcode[4:] modified_metadata['bibcode'] = new_bibcode #get the original list of alt bibcodes alternate_bibcode = registered_record.get('alternate_bibcode', []) From ed5fdb3f0076a5ceddbf7e970d8039daae57ef90 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Mon, 25 Apr 2022 17:32:10 -0400 Subject: [PATCH 14/31] Added function to correct lowercase alternate bibcodes in parsed_metadata. Added check on populate_bibcode_column to make sure it is not run on a db with curated metadata already present. 
--- ADSCitationCapture/db.py | 42 ++++++++++++++++++- .../8e83568feb1d_add_sanitized_value.py | 4 ++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index 5e068e4..310cf1e 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -501,9 +501,33 @@ def populate_bibcode_column(main_session, curated = True): parsed_metadata = metadata.get('parsed', {}) curated_metadata = metadata.get('curated',{}) status = metadata.get('status', None) - _update_citation_target_metadata_alembic(main_session, content, raw_metadata, parsed_metadata, curated_metadata, status) + if not curated_metadata: + _update_citation_target_metadata_alembic(main_session, content, raw_metadata, parsed_metadata, curated_metadata, status) + else: + msg = "This command should never be run on a database with curated metadata. Stopping." + logger.error(msg) + raise Exception(msg) + +def correct_alternate_bibcodes(main_session, curated = False): + """ + Pulls all citation targets from DB and corrects any lowercase final letters in the alternate bibcodes. 
+ """ + logger.debug("Collecting Citation Targets") + records = _get_citation_targets_alembic(main_session, only_status = None) + for record in records: + bibcode = record.get('bibcode', None) + content = record.get('content', None) + logger.debug("Collecting metadata for {}".format(record.get('content'))) + metadata = _get_citation_target_metadata_alembic(main_session, content, curate = curated) + if metadata: + logger.debug("Updating alternate_bibcode field for {}".format(record.get('content'))) + raw_metadata = metadata.get('raw', {}) + parsed_metadata = metadata.get('parsed', {}) + curated_metadata = metadata.get('curated',{}) + status = metadata.get('status', None) + _update_citation_target_metadata_alembic(main_session, content, raw_metadata, parsed_metadata, curated_metadata, status=status, bibcode=bibcode) -def _update_citation_target_metadata_alembic(session, content, raw_metadata, parsed_metadata, curated_metadata = {}, status=None, bibcode = None): +def _update_citation_target_metadata_alembic(session, content, raw_metadata, parsed_metadata, curated_metadata={}, status=None, bibcode=None): """ Update metadata for a citation target when we do not need to close the session after completion @@ -512,6 +536,20 @@ def _update_citation_target_metadata_alembic(session, content, raw_metadata, par if not bibcode: bibcode = parsed_metadata.get('bibcode', None) metadata_updated = _update_citation_target_metadata_session(session, content, raw_metadata, parsed_metadata, curated_metadata, status, bibcode) return metadata_updated + +def _update_citation_target_alt_bibcodes_alembic(session, content, raw_metadata, parsed_metadata, curated_metadata={}, status=None, bibcode=None): + """ + Correct alternate bibcode format for a citation target when we do not need to + close the session after completion + """ + metadata_updated = False + if not bibcode: bibcode = parsed_metadata.get('bibcode', None) + alt_bibcodes = parsed_metadata.get('alternate_bibcode', []) + if 
alt_bibcodes: + alt_bibcodes = [bib[:-1]+bib[-1].upper() for bib in alt_bibcodes] + parsed_metadata['alternate_bibcode'] = alt_bibcodes + metadata_updated = _update_citation_target_metadata_session(session, content, raw_metadata, parsed_metadata, curated_metadata, status, bibcode) + return metadata_updated def _get_citation_target_metadata_alembic(session, doi, curate=True): """ diff --git a/alembic/versions/8e83568feb1d_add_sanitized_value.py b/alembic/versions/8e83568feb1d_add_sanitized_value.py index b9656fb..9bb7ca6 100644 --- a/alembic/versions/8e83568feb1d_add_sanitized_value.py +++ b/alembic/versions/8e83568feb1d_add_sanitized_value.py @@ -7,6 +7,7 @@ """ from alembic import op import sqlalchemy as sa +from ADSCitationCapture import db from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. @@ -17,9 +18,12 @@ def upgrade(): + session = sa.orm.Session(bind=op.get_bind()) op.execute('COMMIT') op.execute("ALTER TYPE target_status_type ADD VALUE 'SANITIZED'") op.execute("ALTER TYPE citation_status_type ADD VALUE 'SANITIZED'") + #bugfix correct all parsed_metadata alt_bibcodes to be uppercase + db.correct_alternate_bibcodes(session) def downgrade(): From 9819b5f969fbea8e85d47a444b85fb924e5104d6 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Mon, 25 Apr 2022 17:38:36 -0400 Subject: [PATCH 15/31] updated populate_bibcode_column so that it won't break when run on a db that contains curated metadata. 
--- ADSCitationCapture/db.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index 310cf1e..63c3c12 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -485,7 +485,7 @@ def mark_all_sanitized_citations_as_discarded(app, content): session.add(citation) session.commit() -def populate_bibcode_column(main_session, curated = True): +def populate_bibcode_column(main_session, curated = False): """ Pulls all citation targets from DB and populates the bibcode column using parsed metadata """ @@ -493,20 +493,17 @@ def populate_bibcode_column(main_session, curated = True): records = _get_citation_targets_alembic(main_session, only_status = None) for record in records: content = record.get('content', None) + bibcode = record.get('bibcode', None) logger.debug("Collecting metadata for {}".format(record.get('content'))) metadata = _get_citation_target_metadata_alembic(main_session, content, curate = curated) if metadata: logger.debug("Updating Bibcode field for {}".format(record.get('content'))) raw_metadata = metadata.get('raw', {}) parsed_metadata = metadata.get('parsed', {}) - curated_metadata = metadata.get('curated',{}) + curated_metadata = metadata.get('curated', {}) status = metadata.get('status', None) - if not curated_metadata: - _update_citation_target_metadata_alembic(main_session, content, raw_metadata, parsed_metadata, curated_metadata, status) - else: - msg = "This command should never be run on a database with curated metadata. Stopping." 
- logger.error(msg) - raise Exception(msg) + _update_citation_target_metadata_alembic(main_session, content, raw_metadata, parsed_metadata, curated_metadata, status=status, bibcode=bibcode) + def correct_alternate_bibcodes(main_session, curated = False): """ @@ -525,7 +522,7 @@ def correct_alternate_bibcodes(main_session, curated = False): parsed_metadata = metadata.get('parsed', {}) curated_metadata = metadata.get('curated',{}) status = metadata.get('status', None) - _update_citation_target_metadata_alembic(main_session, content, raw_metadata, parsed_metadata, curated_metadata, status=status, bibcode=bibcode) + _update_citation_target_alt_bibcodes_alembic(main_session, content, raw_metadata, parsed_metadata, curated_metadata, status=status, bibcode=bibcode) def _update_citation_target_metadata_alembic(session, content, raw_metadata, parsed_metadata, curated_metadata={}, status=None, bibcode=None): """ @@ -533,7 +530,11 @@ def _update_citation_target_metadata_alembic(session, content, raw_metadata, par close the session after completion """ metadata_updated = False - if not bibcode: bibcode = parsed_metadata.get('bibcode', None) + if not curated_metadata: + modified_metadata = parsed_metadata + else: + modified_metadata = generate_modified_metadata(parsed_metadata, curated_metadata) + if not bibcode: bibcode = modified_metadata.get('bibcode', None) metadata_updated = _update_citation_target_metadata_session(session, content, raw_metadata, parsed_metadata, curated_metadata, status, bibcode) return metadata_updated From d61185d5a489a8cbebd3cd0dbbd5db2581e9c1c5 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Mon, 25 Apr 2022 18:15:29 -0400 Subject: [PATCH 16/31] Added catch for potentially broken entries when correcting alternate bibcodes. 
--- ADSCitationCapture/db.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index 63c3c12..8153262 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -504,7 +504,6 @@ def populate_bibcode_column(main_session, curated = False): status = metadata.get('status', None) _update_citation_target_metadata_alembic(main_session, content, raw_metadata, parsed_metadata, curated_metadata, status=status, bibcode=bibcode) - def correct_alternate_bibcodes(main_session, curated = False): """ Pulls all citation targets from DB and corrects any lowercase final letters in the alternate bibcodes. @@ -517,7 +516,7 @@ def correct_alternate_bibcodes(main_session, curated = False): logger.debug("Collecting metadata for {}".format(record.get('content'))) metadata = _get_citation_target_metadata_alembic(main_session, content, curate = curated) if metadata: - logger.debug("Updating alternate_bibcode field for {}".format(record.get('content'))) + logger.debug("Calling update alternate_bibcode field for {}".format(record.get('content'))) raw_metadata = metadata.get('raw', {}) parsed_metadata = metadata.get('parsed', {}) curated_metadata = metadata.get('curated',{}) @@ -544,7 +543,10 @@ def _update_citation_target_alt_bibcodes_alembic(session, content, raw_metadata, close the session after completion """ metadata_updated = False - if not bibcode: bibcode = parsed_metadata.get('bibcode', None) + if not bibcode: + msg = "bibcode should not be None. Please check entry for {}. Skipping.".format(content) + logger.warn(msg) + return metadata_updated alt_bibcodes = parsed_metadata.get('alternate_bibcode', []) if alt_bibcodes: alt_bibcodes = [bib[:-1]+bib[-1].upper() for bib in alt_bibcodes] From 7b9a67381aa47322c54a1db3dc374f36c6cfd12f Mon Sep 17 00:00:00 2001 From: tjacovich Date: Mon, 25 Apr 2022 18:19:08 -0400 Subject: [PATCH 17/31] updated json nargs. 
--- run.py | 1 + 1 file changed, 1 insertion(+) diff --git a/run.py b/run.py index d1805d7..19c58f5 100755 --- a/run.py +++ b/run.py @@ -303,6 +303,7 @@ def _build_diagnostics(bibcodes=None, json_payloads=None): '--json', dest='json_payload', action='store', + nargs='+', default=None, help='Space delimited list of json curated metadata') maintenance_parser.add_argument('--input_filename', From a258cbdde5ce1319325ca11d978e5b6f86be4617 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 27 Apr 2022 09:46:16 -0400 Subject: [PATCH 18/31] Added unit tests for doi sanitization. --- ADSCitationCapture/tests/test_doi.py | 15 +++++ ADSCitationCapture/tests/test_tasks.py | 77 ++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/ADSCitationCapture/tests/test_doi.py b/ADSCitationCapture/tests/test_doi.py index 5099478..ec659fd 100644 --- a/ADSCitationCapture/tests/test_doi.py +++ b/ADSCitationCapture/tests/test_doi.py @@ -30,6 +30,21 @@ def test_software_doi(self): httpretty.disable() httpretty.reset() # clean up registered urls and request history + def test_sanitize_doi_trailing_characters(self): + unsanitized_id = "10.5281/zenodo.11020__amp__quot;__amp__gt" # software + doi_id = "10.5281/zenodo.11020" # software + self.assertEqual(doi.sanitize_zenodo_doi(unsanitized_id), doi_id) + + def test_sanitize_doi_conjoined_dois(self): + unsanitized_id = "10.5281/zenodo.1102010.5281/zenodo.11020" # software + doi_id = "10.5281/zenodo.11020" # software + self.assertEqual(doi.sanitize_zenodo_doi(unsanitized_id), doi_id) + + def test_sanitize_doi_slash_in_dois(self): + unsanitized_id = "10.5281/zenodo/11020" # software + doi_id = "10.5281/zenodo.11020" # software + self.assertEqual(doi.sanitize_zenodo_doi(unsanitized_id), doi_id) + def test_non_software_doi(self): doi_id = "10.1016/j.dsr2.2008.10.030" # Not software expected_response_content = '' diff --git a/ADSCitationCapture/tests/test_tasks.py b/ADSCitationCapture/tests/test_tasks.py index 1ad27a1..a9f803a 
100644 --- a/ADSCitationCapture/tests/test_tasks.py +++ b/ADSCitationCapture/tests/test_tasks.py @@ -32,6 +32,17 @@ def _common_citation_changes_doi(self, status): citation_change.resolved = False citation_change.status = status return citation_changes + + def _common_citation_changes_unsanitized_doi(self, status): + citation_changes = adsmsg.CitationChanges() + citation_change = citation_changes.changes.add() + citation_change.citing = '2005CaJES..42.1987P' + citation_change.cited = '...................' + citation_change.content = '10.5281/zenodo.11020__amp__quot;__amp__gt' + citation_change.content_type = adsmsg.CitationChangeContentType.doi + citation_change.resolved = False + citation_change.status = status + return citation_changes def test_process_new_citation_changes_doi(self): @@ -98,6 +109,71 @@ def test_process_new_citation_changes_doi(self): self.assertTrue(mocked['webhook_dump_event'].called) self.assertTrue(mocked['webhook_emit_event'].called) + def test_process_new_citation_changes_sanitized_doi(self): + citation_changes = self._common_citation_changes_unsanitized_doi(adsmsg.Status.new) + doi_id = "10.5281/zenodo.11020" #Software + with TestBase.mock_multiple_targets({ + 'sanitize_zenodo_doi': patch.object(doi, 'sanitize_zenodo_doi', wraps=doi.sanitize_zenodo_doi), \ + 'citation_already_exists': patch.object(db, 'citation_already_exists', return_value=False), \ + 'get_citation_target_metadata': patch.object(db, 'get_citation_target_metadata', return_value={}), \ + 'get_citations_by_bibcode': patch.object(db, 'get_citations_by_bibcode', return_value=[]), \ + 'store_citation_target': patch.object(db, 'store_citation_target', return_value=True), \ + 'store_citation': patch.object(db, 'store_citation', return_value=True), \ + 'store_event': patch.object(db, 'store_event', return_value=True), \ + 'update_citation': patch.object(db, 'update_citation', return_value=True), \ + 'mark_citation_as_deleted': patch.object(db, 'mark_citation_as_deleted', 
return_value=(True, 'REGISTERED')), \ + 'get_citations': patch.object(db, 'get_citations', return_value=[]), \ + 'update_citation_target_metadata': patch.object(db, 'update_citation_target_metadata', return_value=True), \ + 'get_citation_target_count': patch.object(db, 'get_citation_target_count', return_value=0), \ + 'get_citation_count': patch.object(db, 'get_citation_count', return_value=0), \ + 'get_citation_targets_by_bibcode': patch.object(db, 'get_citation_targets_by_bibcode', return_value=[]), \ + 'get_citation_targets_by_doi': patch.object(db, 'get_citation_targets_by_doi', return_value=[]), \ + 'get_citation_targets': patch.object(db, 'get_citation_targets', return_value=[]), \ + 'get_canonical_bibcode': patch.object(api, 'get_canonical_bibcode', return_value=citation_changes.changes[0].citing), \ + 'get_canonical_bibcodes': patch.object(api, 'get_canonical_bibcodes', return_value=[]), \ + 'request_existing_citations': patch.object(api, 'request_existing_citations', return_value=[]), \ + 'fetch_metadata': patch.object(doi, 'fetch_metadata', return_value=self.mock_data[doi_id]['raw']), \ + 'parse_metadata': patch.object(doi, 'parse_metadata', return_value=self.mock_data[doi_id]['parsed']), \ + 'build_bibcode': patch.object(doi, 'build_bibcode', wraps=doi.build_bibcode), \ + 'url_is_alive': patch.object(url, 'is_alive', return_value=True), \ + 'is_url': patch.object(url, 'is_url', wraps=url.is_url), \ + 'citation_change_to_event_data': patch.object(webhook, 'citation_change_to_event_data', wraps=webhook.citation_change_to_event_data), \ + 'identical_bibcodes_event_data': patch.object(webhook, 'identical_bibcodes_event_data', wraps=webhook.identical_bibcodes_event_data), \ + 'identical_bibcode_and_doi_event_data': patch.object(webhook, 'identical_bibcode_and_doi_event_data', wraps=webhook.identical_bibcode_and_doi_event_data), \ + 'webhook_dump_event': patch.object(webhook, 'dump_event', return_value=True), \ + 'webhook_emit_event': patch.object(webhook, 
'emit_event', return_value=True), \ + 'forward_message': patch.object(app.ADSCitationCaptureCelery, 'forward_message', return_value=True)}) as mocked: + tasks.task_process_citation_changes(citation_changes) + self.assertTrue(mocked['sanitize_zenodo_doi'].called) + self.assertTrue(mocked['citation_already_exists'].called) + self.assertTrue(mocked['get_citation_target_metadata'].called) + self.assertTrue(mocked['fetch_metadata'].called) + self.assertTrue(mocked['parse_metadata'].called) + self.assertFalse(mocked['url_is_alive'].called) + self.assertTrue(mocked['get_canonical_bibcode'].called) + self.assertTrue(mocked['get_canonical_bibcodes'].called) + self.assertTrue(mocked['get_citations_by_bibcode'].called) + self.assertTrue(mocked['store_citation_target'].called) + self.assertTrue(mocked['store_citation'].called) + self.assertFalse(mocked['update_citation'].called) + self.assertFalse(mocked['mark_citation_as_deleted'].called) + self.assertFalse(mocked['get_citations'].called) + self.assertTrue(mocked['forward_message'].called) + self.assertFalse(mocked['update_citation_target_metadata'].called) + self.assertFalse(mocked['get_citation_target_count'].called) + self.assertFalse(mocked['get_citation_count'].called) + self.assertFalse(mocked['get_citation_targets_by_bibcode'].called) + self.assertFalse(mocked['get_citation_targets_by_doi'].called) + self.assertFalse(mocked['get_citation_targets'].called) + self.assertFalse(mocked['request_existing_citations'].called) + self.assertFalse(mocked['build_bibcode'].called) + self.assertFalse(mocked['is_url'].called) + self.assertTrue(mocked['citation_change_to_event_data'].called) + self.assertFalse(mocked['identical_bibcodes_event_data'].called) + self.assertTrue(mocked['identical_bibcode_and_doi_event_data'].called) + self.assertTrue(mocked['store_event'].called) + self.assertTrue(mocked['webhook_dump_event'].called) + self.assertTrue(mocked['webhook_emit_event'].called) def 
test_process_updated_citation_changes_doi(self): citation_changes = self._common_citation_changes_doi(adsmsg.Status.updated) @@ -752,6 +828,7 @@ def test_process_citation_changes_url(self): self.assertTrue(mocked['webhook_emit_event'].called) # because we don't know if an URL is software + def test_process_citation_changes_malformed_url(self): citation_changes = adsmsg.CitationChanges() citation_change = citation_changes.changes.add() From 11bce8e459f68e0163de0f7602f1a385140c2c95 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Mon, 2 May 2022 14:39:01 -0400 Subject: [PATCH 19/31] Fixed alembic revisions to preserve ENUM column values on downgrade. --- ADSCitationCapture/tasks.py | 2 +- alembic/versions/0b6a01b03d4d_urls.py | 35 +++++++----- .../8e83568feb1d_add_sanitized_value.py | 53 +++++++++++-------- 3 files changed, 54 insertions(+), 36 deletions(-) diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index 7ce6b45..83a81c8 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -895,11 +895,11 @@ def task_maintenance_reevaluate(dois, bibcodes): new_citation_change = db.citation_data_to_citation_change(citation_data, previously_discarded_record) try: db.store_citation(app, new_citation_change, new_citation_change.content_type, raw_metadata, parsed_metadata, status = 'REGISTERED') + #mark old citation as SANITIZED db.mark_citation_as_sanitized(app, cite, previously_discarded_record['content']) except Exception as e: logger.error("Failed to update citation from {} to {} with error {}. Skipping.".format(cite, clean_doi, e)) - #mark old citation as SANITIZED #Update the citation target if the content hasn't changed.
else: diff --git a/alembic/versions/0b6a01b03d4d_urls.py b/alembic/versions/0b6a01b03d4d_urls.py index a85215f..875bd00 100644 --- a/alembic/versions/0b6a01b03d4d_urls.py +++ b/alembic/versions/0b6a01b03d4d_urls.py @@ -25,25 +25,32 @@ def upgrade(): def downgrade(): #Move expanded status types to old op.execute("ALTER TYPE target_status_type RENAME TO target_status_type_old") - op.execute("CREATE TYPE target_status_type AS ENUM('REGISTERED', 'DELETED','UPDATED')") + op.execute("CREATE TYPE target_status_type AS ENUM('REGISTERED','DISCARDED','DELETED','UPDATED')") #instantiate original status types op.execute("ALTER TYPE citation_status_type RENAME TO citation_status_type_old") - op.execute("CREATE TYPE citation_status_type AS ENUM('REGISTERED', 'DELETED','UPDATED')") + op.execute("CREATE TYPE citation_status_type AS ENUM('REGISTERED','DISCARDED','DELETED','UPDATED')") - #DROP expanded status columns - op.drop_column('citation_target_version','status') - op.drop_column('citation_target','status') - op.drop_column('citation_version','status') - op.drop_column('citation','status') + + def pgsql_change_type(table_name, column_name, new_enum): + return f"ALTER TABLE {table_name} \ + ALTER COLUMN {column_name} \ + SET DATA TYPE {new_enum} \ + USING ( \ + CASE {column_name}::text \ + WHEN 'EMITTABLE' THEN 'NULL' \ + ELSE {column_name}::text \ + END \ + )::{new_enum}" + + #ALTER column types to original ENUM type. 
+ op.execute(pgsql_change_type('citation_target', 'status', 'target_status_type')) + op.execute(pgsql_change_type('citation_target_version', 'status', 'target_status_type')) + op.execute(pgsql_change_type('citation', 'status', 'citation_status_type')) + op.execute(pgsql_change_type('citation_version', 'status', 'citation_status_type')) + #DROP old ENUM types op.execute("DROP TYPE target_status_type_old") op.execute("DROP TYPE citation_status_type_old") - - #ADD original status columns - op.add_column('citation_target',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', name='target_status_type'), nullable=True)) - op.add_column('citation_target_version',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', name='target_status_type'), nullable=True)) - op.add_column('citation_version',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', name='citation_status_type'), nullable=True)) - op.add_column('citation',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', name='citation_status_type'), nullable=True)) - + \ No newline at end of file diff --git a/alembic/versions/8e83568feb1d_add_sanitized_value.py b/alembic/versions/8e83568feb1d_add_sanitized_value.py index 9bb7ca6..3d97132 100644 --- a/alembic/versions/8e83568feb1d_add_sanitized_value.py +++ b/alembic/versions/8e83568feb1d_add_sanitized_value.py @@ -18,37 +18,48 @@ def upgrade(): - session = sa.orm.Session(bind=op.get_bind()) - op.execute('COMMIT') + connection = None + if not op.get_context().as_sql: + """ + ALTERING ENUM values cannot be done from transaction blocks. + Changing the isolation_level to autocommit puts the specific calls in individually, preventing the issue. + Alembic will warn you about changing isolation level because it has already opened a transaction which is committed. 
+ """ + connection = op.get_bind() + connection.execution_options(isolation_level='AUTOCOMMIT') + op.execute("ALTER TYPE target_status_type ADD VALUE 'SANITIZED'") op.execute("ALTER TYPE citation_status_type ADD VALUE 'SANITIZED'") - #bugfix correct all parsed_metadata alt_bibcodes to be uppercase - db.correct_alternate_bibcodes(session) def downgrade(): #Move expanded status types to old op.execute("ALTER TYPE target_status_type RENAME TO target_status_type_old") - op.execute("CREATE TYPE target_status_type AS ENUM('REGISTERED', 'DELETED', 'UPDATED', 'EMITTABLE')") + op.execute("CREATE TYPE target_status_type AS ENUM('REGISTERED', 'DELETED', 'DISCARDED', 'UPDATED', 'EMITTABLE')") #instantiate original status types op.execute("ALTER TYPE citation_status_type RENAME TO citation_status_type_old") - op.execute("CREATE TYPE citation_status_type AS ENUM('REGISTERED', 'DELETED', 'UPDATED', 'EMITTABLE')") - - #DROP expanded status columns - op.drop_column('citation_target_version','status') - op.drop_column('citation_target','status') - op.drop_column('citation_version','status') - op.drop_column('citation','status') - - #DROP old ENUM types - op.execute("DROP TYPE target_status_type_old") - op.execute("DROP TYPE citation_status_type_old") + op.execute("CREATE TYPE citation_status_type AS ENUM('REGISTERED', 'DELETED', 'DISCARDED', 'UPDATED', 'EMITTABLE')") - #ADD original status columns - op.add_column('citation_target',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', 'EMITTABLE', name='target_status_type'), nullable=True)) - op.add_column('citation_target_version',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', 'EMITTABLE', name='target_status_type'), nullable=True)) - op.add_column('citation_version',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', 'EMITTABLE', name='citation_status_type'), nullable=True)) - op.add_column('citation',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 
'DISCARDED', 'EMITTABLE', name='citation_status_type'), nullable=True)) + def pgsql_change_type(table_name, column_name, new_enum): + return f"ALTER TABLE {table_name} \ + ALTER COLUMN {column_name} \ + SET DATA TYPE {new_enum} \ + USING ( \ + CASE {column_name}::text \ + WHEN 'SANITIZED' THEN 'DISCARDED' \ + ELSE {column_name}::text \ + END \ + )::{new_enum}" + + #Reset to original ENUM type + op.execute(pgsql_change_type('citation_target', 'status', 'target_status_type')) + op.execute(pgsql_change_type('citation_target_version', 'status', 'target_status_type')) + op.execute(pgsql_change_type('citation', 'status', 'citation_status_type')) + op.execute(pgsql_change_type('citation_version', 'status', 'citation_status_type')) + #DROP old (SANITIZED) ENUM types + op.execute("DROP TYPE target_status_type_old") + op.execute("DROP TYPE citation_status_type_old") + \ No newline at end of file From 0e86df6fe22c1451ffb848a032e6490edd1af2a9 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Mon, 2 May 2022 14:41:16 -0400 Subject: [PATCH 20/31] Removed alt_bibcode bugfix from branch. --- ADSCitationCapture/db.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index 8153262..988dc29 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -504,25 +504,6 @@ def populate_bibcode_column(main_session, curated = False): status = metadata.get('status', None) _update_citation_target_metadata_alembic(main_session, content, raw_metadata, parsed_metadata, curated_metadata, status=status, bibcode=bibcode) -def correct_alternate_bibcodes(main_session, curated = False): - """ - Pulls all citation targets from DB and corrects any lowercase final letters in the alternate bibcodes. 
- """ - logger.debug("Collecting Citation Targets") - records = _get_citation_targets_alembic(main_session, only_status = None) - for record in records: - bibcode = record.get('bibcode', None) - content = record.get('content', None) - logger.debug("Collecting metadata for {}".format(record.get('content'))) - metadata = _get_citation_target_metadata_alembic(main_session, content, curate = curated) - if metadata: - logger.debug("Calling update alternate_bibcode field for {}".format(record.get('content'))) - raw_metadata = metadata.get('raw', {}) - parsed_metadata = metadata.get('parsed', {}) - curated_metadata = metadata.get('curated',{}) - status = metadata.get('status', None) - _update_citation_target_alt_bibcodes_alembic(main_session, content, raw_metadata, parsed_metadata, curated_metadata, status=status, bibcode=bibcode) - def _update_citation_target_metadata_alembic(session, content, raw_metadata, parsed_metadata, curated_metadata={}, status=None, bibcode=None): """ Update metadata for a citation target when we do not need to From 941615bcd32f21d2b59943e37fbcd2c0a457a826 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Mon, 2 May 2022 14:45:01 -0400 Subject: [PATCH 21/31] Removed secondary functions related to alt bibcode bugfix. 
--- ADSCitationCapture/db.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index 988dc29..f7741cc 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -517,23 +517,6 @@ def _update_citation_target_metadata_alembic(session, content, raw_metadata, par if not bibcode: bibcode = modified_metadata.get('bibcode', None) metadata_updated = _update_citation_target_metadata_session(session, content, raw_metadata, parsed_metadata, curated_metadata, status, bibcode) return metadata_updated - -def _update_citation_target_alt_bibcodes_alembic(session, content, raw_metadata, parsed_metadata, curated_metadata={}, status=None, bibcode=None): - """ - Correct alternate bibcode format for a citation target when we do not need to - close the session after completion - """ - metadata_updated = False - if not bibcode: - msg = "bibcode should not be None. Please check entry for {}. Skipping.".format(content) - logger.warn(msg) - return metadata_updated - alt_bibcodes = parsed_metadata.get('alternate_bibcode', []) - if alt_bibcodes: - alt_bibcodes = [bib[:-1]+bib[-1].upper() for bib in alt_bibcodes] - parsed_metadata['alternate_bibcode'] = alt_bibcodes - metadata_updated = _update_citation_target_metadata_session(session, content, raw_metadata, parsed_metadata, curated_metadata, status, bibcode) - return metadata_updated def _get_citation_target_metadata_alembic(session, doi, curate=True): """ From 7bd0568f2062bbe260704539004cccdcc601a1d8 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Tue, 3 May 2022 10:53:26 -0400 Subject: [PATCH 22/31] Merged divergent alembic heads for lowercase alt bibcodes fix and doi sanitization. 
--- ...2_alt_bibcodes_fix_and_doi_sanitization.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 alembic/versions/02d38ac44872_alt_bibcodes_fix_and_doi_sanitization.py diff --git a/alembic/versions/02d38ac44872_alt_bibcodes_fix_and_doi_sanitization.py b/alembic/versions/02d38ac44872_alt_bibcodes_fix_and_doi_sanitization.py new file mode 100644 index 0000000..7b7b2b1 --- /dev/null +++ b/alembic/versions/02d38ac44872_alt_bibcodes_fix_and_doi_sanitization.py @@ -0,0 +1,24 @@ +"""alt_bibcodes_fix_and_doi_sanitization + +Revision ID: 02d38ac44872 +Revises: 8e83568feb1d, fae6c4a0716e +Create Date: 2022-05-03 10:50:10.250392 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '02d38ac44872' +down_revision = ('8e83568feb1d', 'fae6c4a0716e') +branch_labels = None +depends_on = None + + +def upgrade(): + pass + + +def downgrade(): + pass From d7d2871fec6da9a774c3ce6f82aad027f850e658 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Mon, 16 May 2022 10:42:56 -0400 Subject: [PATCH 23/31] Merged alembic heads between doi sanitization and master branch. 
--- ...2_alt_bibcodes_fix_and_doi_sanitization.py | 24 ------------------- ...d06eee98a8a_doi_sanitize_and_associated.py | 24 +++++++++++++++++++ 2 files changed, 24 insertions(+), 24 deletions(-) delete mode 100644 alembic/versions/02d38ac44872_alt_bibcodes_fix_and_doi_sanitization.py create mode 100644 alembic/versions/cd06eee98a8a_doi_sanitize_and_associated.py diff --git a/alembic/versions/02d38ac44872_alt_bibcodes_fix_and_doi_sanitization.py b/alembic/versions/02d38ac44872_alt_bibcodes_fix_and_doi_sanitization.py deleted file mode 100644 index 7b7b2b1..0000000 --- a/alembic/versions/02d38ac44872_alt_bibcodes_fix_and_doi_sanitization.py +++ /dev/null @@ -1,24 +0,0 @@ -"""alt_bibcodes_fix_and_doi_sanitization - -Revision ID: 02d38ac44872 -Revises: 8e83568feb1d, fae6c4a0716e -Create Date: 2022-05-03 10:50:10.250392 - -""" -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision = '02d38ac44872' -down_revision = ('8e83568feb1d', 'fae6c4a0716e') -branch_labels = None -depends_on = None - - -def upgrade(): - pass - - -def downgrade(): - pass diff --git a/alembic/versions/cd06eee98a8a_doi_sanitize_and_associated.py b/alembic/versions/cd06eee98a8a_doi_sanitize_and_associated.py new file mode 100644 index 0000000..6cd75f8 --- /dev/null +++ b/alembic/versions/cd06eee98a8a_doi_sanitize_and_associated.py @@ -0,0 +1,24 @@ +"""doi_sanitize_and_associated + +Revision ID: cd06eee98a8a +Revises: 8e83568feb1d, 98cb3e36b2da +Create Date: 2022-05-16 10:35:34.117170 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = 'cd06eee98a8a' +down_revision = ('8e83568feb1d', '98cb3e36b2da') +branch_labels = None +depends_on = None + + +def upgrade(): + pass + + +def downgrade(): + pass From caaecd1745879c83483d763eea789ed3dfb1a215 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 18 May 2022 12:14:59 -0400 Subject: [PATCH 24/31] Added bugfix for datetime is only year issue. 
--- ADSCitationCapture/forward.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/ADSCitationCapture/forward.py b/ADSCitationCapture/forward.py index e2682d9..79c9c90 100644 --- a/ADSCitationCapture/forward.py +++ b/ADSCitationCapture/forward.py @@ -39,6 +39,19 @@ def build_record(app, citation_change, parsed_metadata, citations, db_versions, normalized_authors = parsed_metadata.get('normalized_authors', []) affiliations = parsed_metadata.get('affiliations', ['-']*len(authors)) pubdate = parsed_metadata.get('pubdate', get_date().strftime("%Y-%m-%d")) + try: + solr_date=(datetime.datetime.strptime(pubdate, "%Y-%m-%d")+datetime.timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%S.%fZ') + except: + try: + #In the event only a year is specified, the date is assumed to be January 1st of the given year. + logger.warn("Publication date does not conform to Y-m-d format. Assuming only year is specified.") + pubdate = pubdate+"-01"+"-01" + solr_date=(datetime.datetime.strptime(pubdate, "%Y-%m-%d")+datetime.timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%S.%fZ') + except: + #If above fails, just set it to the current date. Running maintenance_metadata could fix the bad publication date in the future if it is updated upstream. + logger.warn("Cannot parse publication date. Setting to current datetime.") + solr_date=date2solrstamp(entry_date) + source = parsed_metadata.get('source', "Unknown") version = parsed_metadata.get('version', "") doctype = parsed_metadata.get('doctype', "software") @@ -79,7 +92,7 @@ def build_record(app, citation_change, parsed_metadata, citations, db_versions, 'database': ['general', 'astronomy'], 'entry_date': date2solrstamp(entry_date), # date2solrstamp(get_date()), 'year': year, - 'date': (datetime.datetime.strptime(pubdate, "%Y-%m-%d")+datetime.timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%S.%fZ'), # TODO: Why this date has to be 30 minutes in advance? 
This is based on ADSImportPipeline SolrAdapter + 'date': solr_date, # TODO: Why this date has to be 30 minutes in advance? This is based on ADSImportPipeline SolrAdapter 'doctype': doctype, 'doctype_facet_hier': ["0/Non-Article", "1/Non-Article/Software"], 'doi': [doi], From 8c622da0ce15dfa7b6336a216f013864ea05d693 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Thu, 19 May 2022 20:16:02 -0400 Subject: [PATCH 25/31] minor tweak to tasks.py --- ADSCitationCapture/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index 853b9f1..561b1e4 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -288,7 +288,7 @@ def task_process_updated_associated_works(citation_change, associated_versions, original_citations = db.get_citations_by_bibcode(app, citation_target_bibcode) citations = api.get_canonical_bibcodes(app, original_citations) logger.debug("Calling 'task_output_results' with '%s'", citation_change) - task_output_results.delay(citation_change, parsed_metadata, citations, db_versions=associated_versions) + task_output_results.delay(citation_change, parsed_metadata, citations, db_versions=no_self_ref_versions) logger.info("Updating associated works for %s", citation_change.content) db.update_citation_target_metadata(app, citation_change.content, raw_metadata, parsed_metadata, curated_metadata=curated_metadata, associated=no_self_ref_versions, bibcode=citation_target_bibcode) From 6128e8a41ff05cbd4ee070b1e9400e0e5722221f Mon Sep 17 00:00:00 2001 From: tjacovich Date: Tue, 24 May 2022 09:10:20 -0400 Subject: [PATCH 26/31] minor tweak to forward.py that keeps 'ASSOCIATED' property from being applied to software with versions but none that exist in ADS. 
--- ADSCitationCapture/forward.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ADSCitationCapture/forward.py b/ADSCitationCapture/forward.py index 2f6ae99..4e52518 100644 --- a/ADSCitationCapture/forward.py +++ b/ADSCitationCapture/forward.py @@ -142,7 +142,7 @@ def build_record(app, citation_change, parsed_metadata, citations, db_versions, record_dict['status'] = status else: status = 0 # active - if db_versions not in [{"":""}, None]: + if db_versions not in [{"":""}, {}, None]: record_dict['property'].append('ASSOCIATED') if is_release: record_dict['property'].append('RELEASE') @@ -173,7 +173,7 @@ def _build_nonbib_record(app, citation_change, record, db_versions, status): 'simbad_objects': [], 'total_link_counts': 0 # Only used for DATA and not for ESOURCES } - if db_versions not in [{"":""}, None]: + if db_versions not in [{"":""}, {}, None]: nonbib_record_dict['data_links_rows'].append({'link_type': 'ASSOCIATED', 'link_sub_type': '', 'url': db_versions.values(), 'title': db_versions.keys(), 'item_count':0}) nonbib_record = NonBibRecord(**nonbib_record_dict) From f2ab156cc7625f763041f75ecd4b23980b20a826 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 24 Aug 2022 11:55:08 -0400 Subject: [PATCH 27/31] Added latest version to test_base mock_data for 10.5281/zenodo.4475376 --- ADSCitationCapture/tests/test_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ADSCitationCapture/tests/test_base.py b/ADSCitationCapture/tests/test_base.py index 901dfd6..7346608 100644 --- a/ADSCitationCapture/tests/test_base.py +++ b/ADSCitationCapture/tests/test_base.py @@ -296,7 +296,8 @@ def _init_mock_data(self): "10.5281/zenodo.5545068", "10.5281/zenodo.5706396", "10.5281/zenodo.5773480", - "10.5281/zenodo.6513224" + "10.5281/zenodo.6513224", + "10.5281/zenodo.6982547" ]}, 'associated': {"Version v2.0.0": "2017zndo....248351D"} } From fd32c439161300934511e1ea7a42b18824672a32 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 24 
Aug 2022 14:26:03 -0400 Subject: [PATCH 28/31] Added raw_content column to citation model. Modified recreate_previous_expanded_raw_data() to look at raw_content column instead of content column for reconstruction. --- ADSCitationCapture/db.py | 3 ++- ADSCitationCapture/delta_computation.py | 4 +++- ADSCitationCapture/models.py | 1 + ADSCitationCapture/tasks.py | 14 +++++++++----- .../versions/8e83568feb1d_add_sanitized_value.py | 5 ++++- 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index 5fd776c..30e03b9 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -125,7 +125,7 @@ def update_citation_target_curator_message(app, content, msg): msg_updated = _update_citation_target_curator_message_session(session, content, msg) return msg_updated -def store_citation(app, citation_change, content_type, raw_metadata, parsed_metadata, status): +def store_citation(app, citation_change, raw_content, content_type, raw_metadata, parsed_metadata, status): """ Stores a new citation in the DB """ @@ -135,6 +135,7 @@ def store_citation(app, citation_change, content_type, raw_metadata, parsed_meta citation.citing = citation_change.citing citation.cited = citation_change.cited citation.content = citation_change.content + citation.raw_content = raw_content citation.resolved = citation_change.resolved citation.timestamp = citation_change.timestamp.ToDatetime().replace(tzinfo=tzutc()) citation.status = status diff --git a/ADSCitationCapture/delta_computation.py b/ADSCitationCapture/delta_computation.py index 172c933..f4b3f3f 100644 --- a/ADSCitationCapture/delta_computation.py +++ b/ADSCitationCapture/delta_computation.py @@ -191,8 +191,10 @@ def _reconstruct_previous_expanded_raw_data(self): # Reconstruct expanded raw table from the official citation table drop_reconstructed_previous_expanded_table = "DROP TABLE IF EXISTS {0}.{1};" self._execute_sql(drop_reconstructed_previous_expanded_table, 
self.previous_schema_name, self.recreated_previous_expanded_table_name) - reconstruct_previous_expanded_table = "CREATE TABLE {0}.{1} AS SELECT id, citing, cited, CASE WHEN citation_target.content_type = 'DOI' THEN true ELSE false END AS doi, CASE WHEN citation_target.content_type = 'PID' THEN true ELSE false END AS pid, CASE WHEN citation_target.content_type = 'URL' THEN true ELSE false END AS url, citation.content, citation.resolved, citation.timestamp FROM citation INNER JOIN citation_target ON citation.content = citation_target.content WHERE citation.status != 'DELETED';" + reconstruct_previous_expanded_table = "CREATE TABLE {0}.{1} AS SELECT id, citing, cited, CASE WHEN citation_target.content_type = 'DOI' THEN true ELSE false END AS doi, CASE WHEN citation_target.content_type = 'PID' THEN true ELSE false END AS pid, CASE WHEN citation_target.content_type = 'URL' THEN true ELSE false END AS url, citation.raw_content, citation.resolved, citation.timestamp FROM citation INNER JOIN citation_target ON citation.content = citation_target.content WHERE citation.status != 'DELETED';" self._execute_sql(reconstruct_previous_expanded_table, self.previous_schema_name, self.recreated_previous_expanded_table_name) + rename_raw_content_column_previous_expanded_table = "ALTER TABLE {0}.{1} RENAME COLUMN raw_content TO content" + self._execute_sql(rename_raw_content_column_previous_expanded_table, self.previous_schema_name, self.recreated_previous_expanded_table_name) def _find_not_processed_records_from_previous_run(self): """ diff --git a/ADSCitationCapture/models.py b/ADSCitationCapture/models.py index 44c96d6..cf42f2b 100644 --- a/ADSCitationCapture/models.py +++ b/ADSCitationCapture/models.py @@ -53,6 +53,7 @@ class Citation(Base): ) __versioned__ = {} # Must be added to all models that are to be versioned id = Column(Integer, primary_key=True) + raw_content = Column(Text()) content = Column(Text(), ForeignKey('public.citation_target.content')) citing = Column(Text()) # 
Bibcode of the article that is citing a target cited = Column(Text()) # Probably not necessary to keep diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index df2f520..17964c4 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -54,11 +54,13 @@ def task_process_new_citation(citation_change, force=False): and citation_change.content not in ["", None]: #attempts to sanitize the DOI to make it more likely to be valid clean_doi = doi.sanitize_zenodo_doi(citation_change.content) - if clean_doi and clean_doi != citation_change.content: + if clean_doi: logger.info("Replacing citation_change.content: {} with sanitized version: {}".format(citation_change.content, clean_doi)) + raw_content = citation_change.content citation_change.content = clean_doi elif not clean_doi: logger.warn("Failed to sanitize DOI for {}".format(citation_change.content)) + raw_content = citation_change.content # Check if we already have the citation target in the DB metadata = db.get_citation_target_metadata(app, citation_change.content) @@ -157,7 +159,7 @@ def task_process_new_citation(citation_change, force=False): _emit_citation_change(citation_change, parsed_metadata) # Store the citation at the very end, so that if an exception is raised before # this task can be re-run in the future without key collisions in the database - stored = db.store_citation(app, citation_change, content_type, raw_metadata, parsed_metadata, status) + stored = db.store_citation(app, citation_change, raw_content, content_type, raw_metadata, parsed_metadata, status) @app.task(queue='process-github-urls', rate_limit=github_api_limit) def task_process_github_urls(citation_change, metadata): @@ -199,7 +201,7 @@ def task_process_github_urls(citation_change, metadata): _emit_citation_change(citation_change, parsed_metadata) # Store the citation at the very end, so that if an exception is raised before # this task can be re-run in the future without key collisions in the database - 
stored = db.store_citation(app, citation_change, content_type, raw_metadata, parsed_metadata, status) + stored = db.store_citation(app, citation_change, citation_change.content, content_type, raw_metadata, parsed_metadata, status) @app.task(queue='process-updated-citation') def task_process_updated_citation(citation_change, force=False): @@ -946,11 +948,13 @@ def task_maintenance_reevaluate(dois, bibcodes): clean_doi = doi.sanitize_zenodo_doi(previously_discarded_record.get('content')) if clean_doi: if clean_doi != previously_discarded_record.get('content'): + raw_content = previously_discarded_record.get('content') logger.info("Replacing citation_change.content: {} with sanitized version: {}".format(previously_discarded_record.get('content'), clean_doi)) else: logger.warn("Failed to sanitize DOI for {}".format(previously_discarded_record.get('content'))) clean_doi = previously_discarded_record.get('content') - + raw_content = previously_discarded_record.get('content') + #Fetch metadata and process raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], clean_doi) if raw_metadata: @@ -999,7 +1003,7 @@ def task_maintenance_reevaluate(dois, bibcodes): #store new citation new_citation_change = db.citation_data_to_citation_change(citation_data, previously_discarded_record) try: - db.store_citation(app, new_citation_change, new_citation_change.content_type, raw_metadata, parsed_metadata, status = 'REGISTERED') + db.store_citation(app, new_citation_change, raw_content, new_citation_change.content_type, raw_metadata, parsed_metadata, status = 'REGISTERED') #mark old citation as SANITIZED db.mark_citation_as_sanitized(app, cite, previously_discarded_record['content']) except Exception as e: diff --git a/alembic/versions/8e83568feb1d_add_sanitized_value.py b/alembic/versions/8e83568feb1d_add_sanitized_value.py index 3d97132..394c569 100644 --- a/alembic/versions/8e83568feb1d_add_sanitized_value.py +++ 
b/alembic/versions/8e83568feb1d_add_sanitized_value.py @@ -30,7 +30,8 @@ def upgrade(): op.execute("ALTER TYPE target_status_type ADD VALUE 'SANITIZED'") op.execute("ALTER TYPE citation_status_type ADD VALUE 'SANITIZED'") - + op.add_column('citation', sa.Column('raw_content', sa.Text(), nullable=True)) + op.add_column('citation_version', sa.Column('raw_content', sa.Text(), nullable=True)) def downgrade(): #Move expanded status types to old @@ -57,6 +58,8 @@ def pgsql_change_type(table_name, column_name, new_enum): op.execute(pgsql_change_type('citation_target_version', 'status', 'target_status_type')) op.execute(pgsql_change_type('citation', 'status', 'citation_status_type')) op.execute(pgsql_change_type('citation_version', 'status', 'citation_status_type')) + op.drop_column('citation', sa.Column('raw_content', sa.Text(), nullable=True)) + op.drop_column('citation_version', sa.Column('raw_content', sa.Text(), nullable=True)) #DROP old (SANITIZED) ENUM types From 9d0b96c851bf4a84c1118189e5a282f2effdc08d Mon Sep 17 00:00:00 2001 From: tjacovich Date: Thu, 25 Aug 2022 14:19:21 -0400 Subject: [PATCH 29/31] Modified alembic upgrade to populate raw_content. Modified tasks to sanitize new dois and to only mark registered citations as sanitized. Added sanitized targets to db in order to facilitate downgrade. Modified reevaluate to handle sanitizing records in a way consistent with new citation processing. 
--- ADSCitationCapture/db.py | 19 +++--- ADSCitationCapture/tasks.py | 65 +++++++++++++++++-- .../8e83568feb1d_add_sanitized_value.py | 17 ++++- 3 files changed, 82 insertions(+), 19 deletions(-) diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index 30e03b9..d2c8c31 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -419,21 +419,18 @@ def citation_data_to_citation_change(citation_data, previously_discarded_record) return citation_change -def update_citation_content(app, citation_change, old_content): +def update_citation_content(app, citation_change, raw_content): """ Update citation record information """ updated = False with app.session_scope() as session: - citation = session.query(Citation).with_for_update().filter_by(citing=citation_change.citing, content=old_content).first() - #change_timestamp = citation_change.timestamp.ToDatetime().replace(tzinfo=tzutc()) # Consider it as UTC to be able to compare it + citation = session.query(Citation).with_for_update().filter_by(citing=citation_change.citing, content=raw_content).first() if citation: if citation.timestamp < citation_change.timestamp: #citation.citing = citation_change.citing # This should not change - citation.content = citation_change.content # This should not change except in a very specific circumstance related to sanitizing dois - citation.cited = citation_change.cited - citation.resolved = citation_change.resolved - citation.timestamp = citation_change.timestamp + citation.raw_content = raw_content + citation.content = citation_change.content session.add(citation) session.commit() updated = True @@ -479,15 +476,17 @@ def mark_all_discarded_citations_as_registered(app, content): session.add(citation) session.commit() -def mark_citation_as_sanitized(app, citing, content): +def mark_sanitized_citation(app, citing, content, raw_content, status='SANITIZED'): """ Update status to SANITIZED for a single discarded citation """ marked_as_registered = False 
previous_status = None with app.session_scope() as session: - citation = session.query(Citation).with_for_update().filter_by(status='DISCARDED', citing=citing, content=content).first() - citation.status = 'SANITIZED' + citation = session.query(Citation).with_for_update().filter_by(status='DISCARDED', citing=citing, content=raw_content).first() + citation.status = status + citation.content = content + citation.raw_content = raw_content session.add(citation) session.commit() diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index 17964c4..f3f2db0 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -11,6 +11,7 @@ import ADSCitationCapture.api as api import adsmsg import json +import copy # ============================= INITIALIZATION ==================================== # @@ -50,6 +51,8 @@ def task_process_new_citation(citation_change, force=False): content_type = None is_link_alive = False status = "DISCARDED" + raw_status = "DISCARDED" + raw_citation_change = copy.deepcopy(citation_change) if citation_change.content_type == adsmsg.CitationChangeContentType.doi \ and citation_change.content not in ["", None]: #attempts to sanitize the DOI to make it more likely to be valid @@ -58,6 +61,7 @@ def task_process_new_citation(citation_change, force=False): logger.info("Replacing citation_change.content: {} with sanitized version: {}".format(citation_change.content, clean_doi)) raw_content = citation_change.content citation_change.content = clean_doi + elif not clean_doi: logger.warn("Failed to sanitize DOI for {}".format(citation_change.content)) raw_content = citation_change.content @@ -69,6 +73,17 @@ def task_process_new_citation(citation_change, force=False): parsed_metadata = metadata.get('parsed', {}) associated_version_bibcodes = metadata.get('associated', None) + #Do the same for the raw change if different + if raw_content != citation_change.content: + raw_status = 'SANITIZED' + orig_metadata = 
db.get_citation_target_metadata(app, raw_content) + raw_citation_target_in_db = bool(orig_metadata) # False if dict is empty + orig_raw_metadata = orig_metadata.get('raw', None) + raw_parsed_metadata = orig_metadata.get('parsed', {}) + raw_associated_version_bibcodes = orig_metadata.get('associated', None) + if raw_citation_target_in_db: + raw_status = orig_metadata.get('status', 'SANITIZED') # "REGISTERED" if it is a software record + if citation_target_in_db: status = metadata.get('status', 'DISCARDED') # "REGISTERED" if it is a software record @@ -87,7 +102,17 @@ def task_process_new_citation(citation_change, force=False): if parsed_metadata.get('bibcode') not in (None, "") and is_software: status = "REGISTERED" associated_version_bibcodes = _collect_associated_works(citation_change, parsed_metadata) - + if raw_content != citation_change.content: + if not raw_citation_target_in_db: + # Fetch DOI metadata (if HTTP request fails, an exception is raised + # and the task will be re-queued (see app.py and adsputils)) + orig_raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], raw_citation_change.content) + if orig_raw_metadata: + raw_parsed_metadata = doi.parse_metadata(orig_raw_metadata) + raw_is_software = raw_parsed_metadata.get('doctype', '').lower() == "software" + if raw_parsed_metadata.get('bibcode') not in (None, "") and raw_is_software: + raw_status = "REGISTERED" + raw_associated_version_bibcodes = _collect_associated_works(raw_citation_change, raw_parsed_metadata) #PID elif citation_change.content_type == adsmsg.CitationChangeContentType.pid \ and citation_change.content not in ["", None]: @@ -117,12 +142,16 @@ def task_process_new_citation(citation_change, force=False): #Generates entry for Zenodo citations and notifies web broker if status not in [None, "EMITTABLE"]: if not citation_target_in_db: - # Create citation target in the DB + #Create citation target in the DB target_stored = db.store_citation_target(app, 
citation_change, content_type, raw_metadata, parsed_metadata, status, associated_version_bibcodes) #If citation target successfully created, update associated records. if target_stored: _update_associated_citation_targets(citation_change, parsed_metadata, associated_version_bibcodes) + if raw_content != citation_change.content and not raw_citation_target_in_db: + #Create raw citation_target (Needed mainly for downgrades to function properly.) + raw_target_stored = db.store_citation_target(app, raw_citation_change, content_type, orig_raw_metadata, raw_parsed_metadata, raw_status, raw_associated_version_bibcodes) + if status == "REGISTERED": #Connects new bibcode to canonical bibcode and DOI if citation_change.content_type == adsmsg.CitationChangeContentType.doi: @@ -159,6 +188,7 @@ def task_process_new_citation(citation_change, force=False): _emit_citation_change(citation_change, parsed_metadata) # Store the citation at the very end, so that if an exception is raised before # this task can be re-run in the future without key collisions in the database + if citation_change.content != raw_content and status == 'REGISTERED': status = 'SANITIZED' stored = db.store_citation(app, citation_change, raw_content, content_type, raw_metadata, parsed_metadata, status) @app.task(queue='process-github-urls', rate_limit=github_api_limit) @@ -962,6 +992,28 @@ def task_maintenance_reevaluate(dois, bibcodes): is_software = parsed_metadata.get('doctype', '').lower() == "software" if not is_software: logger.error("Discarded '%s', it is not 'software'", clean_doi) + if clean_doi != raw_content: + citation_change = adsmsg.CitationChange(content=raw_content, + content_type=getattr(adsmsg.CitationChangeContentType, previously_discarded_record['content_type'].lower()), + status=adsmsg.Status.new, + timestamp=datetime.now() + ) + original_citations = db.get_citations(app, citation_change, status = 'DISCARDED') + logger.debug("Original citations: {}".format(original_citations)) + for cite in 
original_citations: + logger.debug("Updating content to {} for citing bibcode: {}".format(clean_doi, cite)) + #Fetch full citation object for each citation + citation_data = db.get_citation_data(app, cite, citation_change.content) + citation_data.timestamp = datetime.now() + #replace content + citation_data.content = clean_doi + #store new citation + new_citation_change = db.citation_data_to_citation_change(citation_data, previously_discarded_record) + try: + #mark old citation as SANITIZED + db.mark_sanitized_citation(app, cite, clean_doi, previously_discarded_record['content'], status='DISCARDED') + except Exception as e: + logger.error("Failed to update citation from {} to {} with error {}. Skipping.".format(cite, clean_doi, e)) elif parsed_metadata.get('bibcode') in (None, ""): logger.error("The metadata for '%s' could not be parsed correctly and it did not correctly compute a bibcode", clean_doi) else: @@ -980,12 +1032,12 @@ def task_maintenance_reevaluate(dois, bibcodes): if citation_target_in_db: logger.warn("Sanitized doi: {} already exists in db. Pointing citations to new target.".format(clean_doi)) stored = True - updated = db.update_citation_target_metadata(app, previously_discarded_record['content'], raw_metadata, parsed_metadata, status='SANITIZED') + updated = db.update_citation_target_metadata(app, raw_content, parsed_metadata={}, raw_metadata=None, status='SANITIZED') #Add citation target to database. 
Update old citation to SANITIZED else: - stored = db.store_citation_target(app, citation_change, previously_discarded_record['content_type'], raw_metadata, parsed_metadata, status='REGISTERED') - updated = db.update_citation_target_metadata(app, previously_discarded_record['content'], raw_metadata, parsed_metadata, status='SANITIZED') + stored = db.store_citation_target(app, citation_change, clean_doi, raw_metadata, parsed_metadata, status='REGISTERED') + updated = db.update_citation_target_metadata(app, previously_discarded_record['content'], parsed_metadata={}, raw_metadata=None, status='SANITIZED') logger.debug("Stored is : {} for citation target {}".format(stored, previously_discarded_record['content'])) #If stored, go through and find all citations to the old doi and point them to the new record. if stored: @@ -1003,9 +1055,8 @@ def task_maintenance_reevaluate(dois, bibcodes): #store new citation new_citation_change = db.citation_data_to_citation_change(citation_data, previously_discarded_record) try: - db.store_citation(app, new_citation_change, raw_content, new_citation_change.content_type, raw_metadata, parsed_metadata, status = 'REGISTERED') #mark old citation as SANITIZED - db.mark_citation_as_sanitized(app, cite, previously_discarded_record['content']) + db.mark_sanitized_citation(app, cite, clean_doi, previously_discarded_record['content']) except Exception as e: logger.error("Failed to update citation from {} to {} with error {}. 
Skipping.".format(cite, clean_doi, e)) diff --git a/alembic/versions/8e83568feb1d_add_sanitized_value.py b/alembic/versions/8e83568feb1d_add_sanitized_value.py index 394c569..60d1d37 100644 --- a/alembic/versions/8e83568feb1d_add_sanitized_value.py +++ b/alembic/versions/8e83568feb1d_add_sanitized_value.py @@ -16,6 +16,8 @@ branch_labels = None depends_on = None +def psql_copy_column_values(table_name, src_column, dst_column): + return f"UPDATE {table_name} SET {dst_column} = {src_column}" def upgrade(): connection = None @@ -28,10 +30,14 @@ def upgrade(): connection = op.get_bind() connection.execution_options(isolation_level='AUTOCOMMIT') + op.execute("ALTER TYPE target_status_type ADD VALUE 'SANITIZED'") op.execute("ALTER TYPE citation_status_type ADD VALUE 'SANITIZED'") + + #Add raw_citation column op.add_column('citation', sa.Column('raw_content', sa.Text(), nullable=True)) op.add_column('citation_version', sa.Column('raw_content', sa.Text(), nullable=True)) + op.execute(psql_copy_column_values('citation', 'content', 'raw_content')) def downgrade(): #Move expanded status types to old @@ -53,16 +59,23 @@ def pgsql_change_type(table_name, column_name, new_enum): END \ )::{new_enum}" + #Reset to original ENUM type op.execute(pgsql_change_type('citation_target', 'status', 'target_status_type')) op.execute(pgsql_change_type('citation_target_version', 'status', 'target_status_type')) op.execute(pgsql_change_type('citation', 'status', 'citation_status_type')) op.execute(pgsql_change_type('citation_version', 'status', 'citation_status_type')) - op.drop_column('citation', sa.Column('raw_content', sa.Text(), nullable=True)) - op.drop_column('citation_version', sa.Column('raw_content', sa.Text(), nullable=True)) + + #Set content=raw_content so that the content column matches what it would be before this revision + op.execute(psql_copy_column_values('citation', 'raw_content', 'content')) + op.execute(psql_copy_column_values('citation_version', 'raw_content', 'content')) + 
#Drop the raw_citation_column + op.drop_column('citation','raw_content') + op.drop_column('citation_version','raw_content') #DROP old (SANITIZED) ENUM types op.execute("DROP TYPE target_status_type_old") op.execute("DROP TYPE citation_status_type_old") + \ No newline at end of file From 1015ee224796b3615463681001a82100814a86e9 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Thu, 25 Aug 2022 14:23:49 -0400 Subject: [PATCH 30/31] fixed bug that caused citation capture to try and parse raw data for non-Zenodo records. --- ADSCitationCapture/tasks.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index f3f2db0..223e007 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -73,16 +73,18 @@ def task_process_new_citation(citation_change, force=False): parsed_metadata = metadata.get('parsed', {}) associated_version_bibcodes = metadata.get('associated', None) - #Do the same for the raw change if different - if raw_content != citation_change.content: - raw_status = 'SANITIZED' - orig_metadata = db.get_citation_target_metadata(app, raw_content) - raw_citation_target_in_db = bool(orig_metadata) # False if dict is empty - orig_raw_metadata = orig_metadata.get('raw', None) - raw_parsed_metadata = orig_metadata.get('parsed', {}) - raw_associated_version_bibcodes = orig_metadata.get('associated', None) - if raw_citation_target_in_db: - raw_status = orig_metadata.get('status', 'SANITIZED') # "REGISTERED" if it is a software record + if citation_change.content_type == adsmsg.CitationChangeContentType.doi \ + and citation_change.content not in ["", None]: + #Do the same for the raw change if different + if raw_content != citation_change.content: + raw_status = 'SANITIZED' + orig_metadata = db.get_citation_target_metadata(app, raw_content) + raw_citation_target_in_db = bool(orig_metadata) # False if dict is empty + orig_raw_metadata = orig_metadata.get('raw', 
None) + raw_parsed_metadata = orig_metadata.get('parsed', {}) + raw_associated_version_bibcodes = orig_metadata.get('associated', None) + if raw_citation_target_in_db: + raw_status = orig_metadata.get('status', 'SANITIZED') # "REGISTERED" if it is a software record if citation_target_in_db: status = metadata.get('status', 'DISCARDED') # "REGISTERED" if it is a software record From 822fd9636f0e00689ea6114e2a949115fefb9d87 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Tue, 20 Sep 2022 08:54:13 -0400 Subject: [PATCH 31/31] Updated test_doi expected output. --- ADSCitationCapture/tests/test_base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ADSCitationCapture/tests/test_base.py b/ADSCitationCapture/tests/test_base.py index 7346608..07d26fd 100644 --- a/ADSCitationCapture/tests/test_base.py +++ b/ADSCitationCapture/tests/test_base.py @@ -297,7 +297,10 @@ def _init_mock_data(self): "10.5281/zenodo.5706396", "10.5281/zenodo.5773480", "10.5281/zenodo.6513224", - "10.5281/zenodo.6982547" + "10.5281/zenodo.6982547", + "10.5281/zenodo.7032947", + "10.5281/zenodo.7032953", + "10.5281/zenodo.7084615" ]}, 'associated': {"Version v2.0.0": "2017zndo....248351D"} }