diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index c33aa41..d2c8c31 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -2,8 +2,8 @@ from psycopg2 import IntegrityError from dateutil.tz import tzutc from ADSCitationCapture.models import Citation, CitationTarget, Event +from adsmsg import CitationChange, CitationChangeContentType from ADSCitationCapture import doi -from adsmsg import CitationChange from adsputils import setup_logging # ============================= INITIALIZATION ==================================== # @@ -18,7 +18,6 @@ level=config.get('LOGGING_LEVEL', 'INFO'), attach_stdout=config.get('LOG_STDOUT', False)) - # =============================== FUNCTIONS ======================================= # def store_event(app, data): """ @@ -73,6 +72,10 @@ def _update_citation_target_metadata_session(session, content, raw_metadata, par raw_metadata = raw_metadata.decode('utf-8') except UnicodeEncodeError: pass + + if status == 'SANITIZED': + #reset status but otherwise leave the citation target alone + citation_target.status = status if citation_target.raw_cited_metadata != raw_metadata or citation_target.parsed_cited_metadata != parsed_metadata or \ (status is not None and citation_target.status != status) or citation_target.curated_metadata != curated_metadata or \ citation_target.bibcode != bibcode or citation_target.associated_works != associated: @@ -122,7 +125,7 @@ def update_citation_target_curator_message(app, content, msg): msg_updated = _update_citation_target_curator_message_session(session, content, msg) return msg_updated -def store_citation(app, citation_change, content_type, raw_metadata, parsed_metadata, status): +def store_citation(app, citation_change, raw_content, content_type, raw_metadata, parsed_metadata, status): """ Stores a new citation in the DB """ @@ -132,6 +135,7 @@ def store_citation(app, citation_change, content_type, raw_metadata, parsed_meta citation.citing = citation_change.citing 
def get_citations(app, citation_change, status='REGISTERED'):
    """
    Return the citing bibcodes of all citations to a given content.

    Only citations whose status equals `status` are returned. With the
    default ('REGISTERED') this ignores DELETED and DISCARDED citations.

    :param app: application object providing `session_scope()`
    :param citation_change: object whose `content` identifies the citation target
    :param status: citation status to select
    :return: list of citing bibcodes (empty if nothing matches)
    """
    with app.session_scope() as session:
        matching = session.query(Citation).filter_by(content=citation_change.content, status=status).all()
        citation_bibcodes = [record.citing for record in matching]
    return citation_bibcodes

def get_citation_data(app, citing_bibcode, content):
    """
    Get the data for a given citation.

    Looks up the first citation matching `content` and `citing_bibcode` and
    returns a new, unattached Citation carrying a copy of its fields, so the
    caller can modify it without touching the stored row.

    :param app: application object providing `session_scope()`
    :param citing_bibcode: bibcode of the citing article
    :param content: content (e.g. DOI) of the cited target
    :return: a Citation copy, or None when no matching citation exists
    """
    # raw_content is copied too so the copy does not silently lose the
    # originally ingested content.
    copied_fields = ('citing', 'cited', 'content', 'raw_content', 'resolved', 'timestamp', 'status')
    with app.session_scope() as session:
        stored = session.query(Citation).filter_by(content=content, citing=citing_bibcode).first()
        if stored is None:
            # Make the "not found" result explicit for callers.
            return None
        citation = Citation()
        for field in copied_fields:
            setattr(citation, field, getattr(stored, field))
        return citation
def citation_data_to_citation_change(citation_data, previously_discarded_record):
    """
    Takes data from a citation and converts it into a citation_change.

    :param citation_data: object exposing content, citing, cited, resolved and
        timestamp (a datetime) attributes
    :param previously_discarded_record: mapping whose 'content_type' entry names
        the content type (e.g. 'DOI'); it is lower-cased and looked up on
        CitationChangeContentType
    :return: a populated CitationChange message
    """
    citation_change = CitationChange()
    citation_change.content_type = getattr(CitationChangeContentType, previously_discarded_record['content_type'].lower())
    citation_change.content = citation_data.content
    citation_change.citing = citation_data.citing
    citation_change.cited = citation_data.cited
    citation_change.resolved = citation_data.resolved
    citation_change.timestamp.FromDatetime(citation_data.timestamp)

    return citation_change

def update_citation_content(app, citation_change, raw_content):
    """
    Update citation record information

    Finds the citation stored under `raw_content` for the citing bibcode and,
    if the received change is newer than the stored record, points it at
    `citation_change.content` while keeping `raw_content` as the original value.

    :return: True if the citation was updated, False otherwise
    """
    updated = False
    # citation_change.timestamp is a protobuf Timestamp; convert it the same way
    # store_citation does so it can be compared with the stored datetime.
    # NOTE(review): assumes the stored timestamp is timezone-aware - confirm.
    change_timestamp = citation_change.timestamp.ToDatetime().replace(tzinfo=tzutc())
    with app.session_scope() as session:
        citation = session.query(Citation).with_for_update().filter_by(citing=citation_change.citing, content=raw_content).first()
        if citation is None:
            logger.info("Unable to update citation (citing '%s', content '%s' and timestamp '%s')", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString())
        elif citation.timestamp < change_timestamp:
            # citation.citing is deliberately left untouched: it should not change
            citation.raw_content = raw_content
            citation.content = citation_change.content
            session.add(citation)
            session.commit()
            updated = True
            logger.info("Updated citation (citing '%s', content '%s' and timestamp '%s')", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString())
        else:
            logger.info("Ignoring citation update (citing '%s', content '%s' and timestamp '%s') because received timestamp is equal/older than timestamp in database", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString())

    return updated

def mark_sanitized_citation(app, citing, content, raw_content, status='SANITIZED'):
    """
    Update status (SANITIZED by default) for a single discarded citation

    The DISCARDED citation stored under `raw_content` for `citing` is pointed
    at the sanitized `content`; `raw_content` keeps the original value.

    :return: True if a citation was updated, False if no matching DISCARDED
        citation exists
    """
    with app.session_scope() as session:
        citation = session.query(Citation).with_for_update().filter_by(status='DISCARDED', citing=citing, content=raw_content).first()
        if citation is None:
            # Report the miss explicitly instead of failing on None.status
            logger.error("Unable to mark citation as '%s': no DISCARDED citation found (citing '%s', content '%s')", status, citing, raw_content)
            return False
        citation.status = status
        citation.content = content
        citation.raw_content = raw_content
        session.add(citation)
        session.commit()
    return True

def mark_all_discarded_citations_as_sanitized(app, content):
    """
    Update status to SANITIZED for all discarded citations of a given content
    """
    with app.session_scope() as session:
        citations = session.query(Citation).with_for_update().filter_by(status='DISCARDED', content=content).all()
        for citation in citations:
            citation.status = 'SANITIZED'
            session.add(citation)
        session.commit()

def mark_all_sanitized_citations_as_discarded(app, content):
    """
    Update status to DISCARDED for all sanitized citations of a given content
    """
    with app.session_scope() as session:
        citations = session.query(Citation).with_for_update().filter_by(status='SANITIZED', content=content).all()
        for citation in citations:
            # Was 'SANITIZED', which made this function a no-op
            citation.status = 'DISCARDED'
            session.add(citation)
        session.commit()
a/ADSCitationCapture/delta_computation.py b/ADSCitationCapture/delta_computation.py index 172c933..f4b3f3f 100644 --- a/ADSCitationCapture/delta_computation.py +++ b/ADSCitationCapture/delta_computation.py @@ -191,8 +191,10 @@ def _reconstruct_previous_expanded_raw_data(self): # Reconstruct expanded raw table from the official citation table drop_reconstructed_previous_expanded_table = "DROP TABLE IF EXISTS {0}.{1};" self._execute_sql(drop_reconstructed_previous_expanded_table, self.previous_schema_name, self.recreated_previous_expanded_table_name) - reconstruct_previous_expanded_table = "CREATE TABLE {0}.{1} AS SELECT id, citing, cited, CASE WHEN citation_target.content_type = 'DOI' THEN true ELSE false END AS doi, CASE WHEN citation_target.content_type = 'PID' THEN true ELSE false END AS pid, CASE WHEN citation_target.content_type = 'URL' THEN true ELSE false END AS url, citation.content, citation.resolved, citation.timestamp FROM citation INNER JOIN citation_target ON citation.content = citation_target.content WHERE citation.status != 'DELETED';" + reconstruct_previous_expanded_table = "CREATE TABLE {0}.{1} AS SELECT id, citing, cited, CASE WHEN citation_target.content_type = 'DOI' THEN true ELSE false END AS doi, CASE WHEN citation_target.content_type = 'PID' THEN true ELSE false END AS pid, CASE WHEN citation_target.content_type = 'URL' THEN true ELSE false END AS url, citation.raw_content, citation.resolved, citation.timestamp FROM citation INNER JOIN citation_target ON citation.content = citation_target.content WHERE citation.status != 'DELETED';" self._execute_sql(reconstruct_previous_expanded_table, self.previous_schema_name, self.recreated_previous_expanded_table_name) + rename_raw_content_column_previous_expanded_table = "ALTER TABLE {0}.{1} RENAME COLUMN raw_content TO content" + self._execute_sql(rename_raw_content_column_previous_expanded_table, self.previous_schema_name, self.recreated_previous_expanded_table_name) def 
# Canonical form: <prefix>/zenodo.<record id>. The dot after "10" is escaped and
# at least one digit is required so "10.5281/zenodo." is not accepted as a DOI.
_ZENODO_DOI_DOT_RE = re.compile(r"10\.\d{4,9}/zenodo\.(\d+)", re.IGNORECASE)
# Common malformed variant using a slash instead of a dot before the record id.
_ZENODO_DOI_SLASH_RE = re.compile(r"10\.\d{4,9}/zenodo/(\d+)", re.IGNORECASE)

def sanitize_zenodo_doi(doi):
    """
    Takes the imported citation_change content and tries to sanitize it if it is a zenodo doi.

    :param doi: raw content string, possibly with trailing junk, several
        conjoined DOIs, or a slash instead of a dot before the record id
    :return: the sanitized DOI, or None if no zenodo DOI could be extracted
    """
    return _sanitize_zendo_doi(doi)

def _sanitize_zendo_doi(doi):
    """
    Extract the first zenodo DOI found in `doi`.

    A dot-form match is returned as found in the input; a slash-form match is
    rewritten to the canonical '10.5281/zenodo.<id>'. Returns None (after
    logging an error) when the input is not a string containing the zenodo
    prefix or when no record id follows it.
    """
    doi_root = '10.5281'
    try:
        #splits apart any conjoined dois and takes the first full one.
        spl_doi = doi_root + doi.split(doi_root)[1]
    except (AttributeError, IndexError, TypeError) as e:
        # Not a string, or the zenodo prefix does not appear in it at all
        logger.error("Attempt to parse content: {} failed with error: {}.".format(doi, e))
        return None

    dot_match = _ZENODO_DOI_DOT_RE.search(spl_doi)
    if dot_match:
        return dot_match.group(0)

    # The slash form is recoverable, so no error is logged for it
    slash_match = _ZENODO_DOI_SLASH_RE.search(spl_doi)
    if slash_match:
        return doi_root + "/" + "zenodo." + slash_match.group(1)

    logger.error("Attempt to parse content: {} failed with error: {}.".format(doi, "no zenodo record id found"))
    return None
raw_citation_change = copy.deepcopy(citation_change) + if citation_change.content_type == adsmsg.CitationChangeContentType.doi \ + and citation_change.content not in ["", None]: + #attempts to sanitize the DOI to make it more likely to be valid + clean_doi = doi.sanitize_zenodo_doi(citation_change.content) + if clean_doi: + logger.info("Replacing citation_change.content: {} with sanitized version: {}".format(citation_change.content, clean_doi)) + raw_content = citation_change.content + citation_change.content = clean_doi + + elif not clean_doi: + logger.warn("Failed to sanitize DOI for {}".format(citation_change.content)) + raw_content = citation_change.content # Check if we already have the citation target in the DB metadata = db.get_citation_target_metadata(app, citation_change.content) @@ -58,6 +73,19 @@ def task_process_new_citation(citation_change, force=False): parsed_metadata = metadata.get('parsed', {}) associated_version_bibcodes = metadata.get('associated', None) + if citation_change.content_type == adsmsg.CitationChangeContentType.doi \ + and citation_change.content not in ["", None]: + #Do the same for the raw change if different + if raw_content != citation_change.content: + raw_status = 'SANITIZED' + orig_metadata = db.get_citation_target_metadata(app, raw_content) + raw_citation_target_in_db = bool(orig_metadata) # False if dict is empty + orig_raw_metadata = orig_metadata.get('raw', None) + raw_parsed_metadata = orig_metadata.get('parsed', {}) + raw_associated_version_bibcodes = orig_metadata.get('associated', None) + if raw_citation_target_in_db: + raw_status = orig_metadata.get('status', 'SANITIZED') # "REGISTERED" if it is a software record + if citation_target_in_db: status = metadata.get('status', 'DISCARDED') # "REGISTERED" if it is a software record @@ -76,7 +104,17 @@ def task_process_new_citation(citation_change, force=False): if parsed_metadata.get('bibcode') not in (None, "") and is_software: status = "REGISTERED" 
associated_version_bibcodes = _collect_associated_works(citation_change, parsed_metadata) - + if raw_content != citation_change.content: + if not raw_citation_target_in_db: + # Fetch DOI metadata (if HTTP request fails, an exception is raised + # and the task will be re-queued (see app.py and adsputils)) + orig_raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], raw_citation_change.content) + if orig_raw_metadata: + raw_parsed_metadata = doi.parse_metadata(orig_raw_metadata) + raw_is_software = raw_parsed_metadata.get('doctype', '').lower() == "software" + if raw_parsed_metadata.get('bibcode') not in (None, "") and raw_is_software: + raw_status = "REGISTERED" + raw_associated_version_bibcodes = _collect_associated_works(raw_citation_change, raw_parsed_metadata) #PID elif citation_change.content_type == adsmsg.CitationChangeContentType.pid \ and citation_change.content not in ["", None]: @@ -106,12 +144,16 @@ def task_process_new_citation(citation_change, force=False): #Generates entry for Zenodo citations and notifies web broker if status not in [None, "EMITTABLE"]: if not citation_target_in_db: - # Create citation target in the DB + #Create citation target in the DB target_stored = db.store_citation_target(app, citation_change, content_type, raw_metadata, parsed_metadata, status, associated_version_bibcodes) #If citation target successfully created, update associated records. if target_stored: _update_associated_citation_targets(citation_change, parsed_metadata, associated_version_bibcodes) + if raw_content != citation_change.content and not raw_citation_target_in_db: + #Create raw citation_target (Needed mainly for downgrades to function properly.) 
+ raw_target_stored = db.store_citation_target(app, raw_citation_change, content_type, orig_raw_metadata, raw_parsed_metadata, raw_status, raw_associated_version_bibcodes) + if status == "REGISTERED": #Connects new bibcode to canonical bibcode and DOI if citation_change.content_type == adsmsg.CitationChangeContentType.doi: @@ -148,7 +190,8 @@ def task_process_new_citation(citation_change, force=False): _emit_citation_change(citation_change, parsed_metadata) # Store the citation at the very end, so that if an exception is raised before # this task can be re-run in the future without key collisions in the database - stored = db.store_citation(app, citation_change, content_type, raw_metadata, parsed_metadata, status) + if citation_change.content != raw_content and status == 'REGISTERED': status = 'SANITIZED' + stored = db.store_citation(app, citation_change, raw_content, content_type, raw_metadata, parsed_metadata, status) @app.task(queue='process-github-urls', rate_limit=github_api_limit) def task_process_github_urls(citation_change, metadata): @@ -190,7 +233,7 @@ def task_process_github_urls(citation_change, metadata): _emit_citation_change(citation_change, parsed_metadata) # Store the citation at the very end, so that if an exception is raised before # this task can be re-run in the future without key collisions in the database - stored = db.store_citation(app, citation_change, content_type, raw_metadata, parsed_metadata, status) + stored = db.store_citation(app, citation_change, citation_change.content, content_type, raw_metadata, parsed_metadata, status) @app.task(queue='process-updated-citation') def task_process_updated_citation(citation_change, force=False): @@ -933,21 +976,100 @@ def task_maintenance_reevaluate(dois, bibcodes): # Fetch DOI metadata (if HTTP request fails, an exception is raised # and the task will be re-queued (see app.py and adsputils)) if previously_discarded_record['content_type'] == 'DOI': - raw_metadata = 
doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], previously_discarded_record['content']) + #Try and sanitize the DOI before reevaluating + clean_doi = doi.sanitize_zenodo_doi(previously_discarded_record.get('content')) + if clean_doi: + if clean_doi != previously_discarded_record.get('content'): + raw_content = previously_discarded_record.get('content') + logger.info("Replacing citation_change.content: {} with sanitized version: {}".format(previously_discarded_record.get('content'), clean_doi)) + else: + logger.warn("Failed to sanitize DOI for {}".format(previously_discarded_record.get('content'))) + clean_doi = previously_discarded_record.get('content') + raw_content = previously_discarded_record.get('content') + + #Fetch metadata and process + raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], clean_doi) if raw_metadata: parsed_metadata = doi.parse_metadata(raw_metadata) is_software = parsed_metadata.get('doctype', '').lower() == "software" if not is_software: - logger.error("Discarded '%s', it is not 'software'", previously_discarded_record['content']) + logger.error("Discarded '%s', it is not 'software'", clean_doi) + if clean_doi != raw_content: + citation_change = adsmsg.CitationChange(content=raw_content, + content_type=getattr(adsmsg.CitationChangeContentType, previously_discarded_record['content_type'].lower()), + status=adsmsg.Status.new, + timestamp=datetime.now() + ) + original_citations = db.get_citations(app, citation_change, status = 'DISCARDED') + logger.debug("Original citations: {}".format(original_citations)) + for cite in original_citations: + logger.debug("Updating content to {} for citing bibcode: {}".format(clean_doi, cite)) + #Fetch full citation object for each citation + citation_data = db.get_citation_data(app, cite, citation_change.content) + citation_data.timestamp = datetime.now() + #replace content + citation_data.content = clean_doi + #store new citation + new_citation_change = 
db.citation_data_to_citation_change(citation_data, previously_discarded_record) + try: + #mark old citation as SANITIZED + db.mark_sanitized_citation(app, cite, clean_doi, previously_discarded_record['content'], status='DISCARDED') + except Exception as e: + logger.error("Failed to update citation from {} to {} with error {}. Skipping.".format(cite, clean_doi, e)) elif parsed_metadata.get('bibcode') in (None, ""): - logger.error("The metadata for '%s' could not be parsed correctly and it did not correctly compute a bibcode", previously_discarded_record['content']) + logger.error("The metadata for '%s' could not be parsed correctly and it did not correctly compute a bibcode", clean_doi) else: # Create citation target in the DB - updated = db.update_citation_target_metadata(app, previously_discarded_record['content'], raw_metadata, parsed_metadata, status='REGISTERED') - if updated: - db.mark_all_discarded_citations_as_registered(app, previously_discarded_record['content']) + #If the DOI has been sanitized, CC needs to update the content of each citation + if clean_doi != previously_discarded_record.get('content'): + #check to make sure clean doi doesn't already exist + metadata = db.get_citation_target_metadata(app, clean_doi) + citation_target_in_db = bool(metadata) + citation_change = adsmsg.CitationChange(content=clean_doi, + content_type=getattr(adsmsg.CitationChangeContentType, previously_discarded_record['content_type'].lower()), + status=adsmsg.Status.new, + timestamp=datetime.now() + ) + #Check if sanitized record is already in db + if citation_target_in_db: + logger.warn("Sanitized doi: {} already exists in db. Pointing citations to new target.".format(clean_doi)) + stored = True + updated = db.update_citation_target_metadata(app, raw_content, parsed_metadata={}, raw_metadata=None, status='SANITIZED') + + #Add citation target to database. 
Update old citation to SANITIZED + else: + stored = db.store_citation_target(app, citation_change, clean_doi, raw_metadata, parsed_metadata, status='REGISTERED') + updated = db.update_citation_target_metadata(app, previously_discarded_record['content'], parsed_metadata={}, raw_metadata=None, status='SANITIZED') + logger.debug("Stored is : {} for citation target {}".format(stored, previously_discarded_record['content'])) + #If stored, go through and find all citations to the old doi and point them to the new record. + if stored: + #Get all DISCARDED citations + citation_change.content = previously_discarded_record.get('content', '') + original_citations = db.get_citations(app, citation_change, status = 'DISCARDED') + logger.debug("Original citations: {}".format(original_citations)) + for cite in original_citations: + logger.debug("Updating content to {} for citing bibcode: {}".format(clean_doi, cite)) + #Fetch full citation object for each citation + citation_data = db.get_citation_data(app, cite, citation_change.content) + citation_data.timestamp = datetime.now() + #replace content + citation_data.content = clean_doi + #store new citation + new_citation_change = db.citation_data_to_citation_change(citation_data, previously_discarded_record) + try: + #mark old citation as SANITIZED + db.mark_sanitized_citation(app, cite, clean_doi, previously_discarded_record['content']) + except Exception as e: + logger.error("Failed to update citation from {} to {} with error {}. Skipping.".format(cite, clean_doi, e)) + + + #Update the citation target if the content hasn't changed. + else: + updated = db.update_citation_target_metadata(app, clean_doi, raw_metadata, parsed_metadata, status='REGISTERED') + + #If there are updates to records, send the updates to the master TODO: Should we emit events as well? 
def test_sanitize_doi_trailing_characters(self):
    """Residue trailing the record id is dropped by the sanitizer."""
    expected = "10.5281/zenodo.11020" # software
    dirty = "10.5281/zenodo.11020__amp__quot;__amp__gt" # software
    cleaned = doi.sanitize_zenodo_doi(dirty)
    self.assertEqual(cleaned, expected)

def test_sanitize_doi_conjoined_dois(self):
    """Only the first DOI is kept when two are glued together."""
    expected = "10.5281/zenodo.11020" # software
    dirty = "10.5281/zenodo.1102010.5281/zenodo.11020" # software
    cleaned = doi.sanitize_zenodo_doi(dirty)
    self.assertEqual(cleaned, expected)

def test_sanitize_doi_slash_in_dois(self):
    """A slash before the record id is rewritten to the canonical dot."""
    expected = "10.5281/zenodo.11020" # software
    dirty = "10.5281/zenodo/11020" # software
    cleaned = doi.sanitize_zenodo_doi(dirty)
    self.assertEqual(cleaned, expected)
expected_response_content = '' diff --git a/ADSCitationCapture/tests/test_tasks.py b/ADSCitationCapture/tests/test_tasks.py index d95d031..f24d75d 100644 --- a/ADSCitationCapture/tests/test_tasks.py +++ b/ADSCitationCapture/tests/test_tasks.py @@ -31,6 +31,17 @@ def _common_citation_changes_doi(self, status): citation_change.resolved = False citation_change.status = status return citation_changes + + def _common_citation_changes_unsanitized_doi(self, status): + citation_changes = adsmsg.CitationChanges() + citation_change = citation_changes.changes.add() + citation_change.citing = '2005CaJES..42.1987P' + citation_change.cited = '...................' + citation_change.content = '10.5281/zenodo.11020__amp__quot;__amp__gt' + citation_change.content_type = adsmsg.CitationChangeContentType.doi + citation_change.resolved = False + citation_change.status = status + return citation_changes def test_process_new_citation_changes_doi(self): citation_changes = self._common_citation_changes_doi(adsmsg.Status.new) @@ -96,6 +107,71 @@ def test_process_new_citation_changes_doi(self): self.assertTrue(mocked['webhook_dump_event'].called) self.assertTrue(mocked['webhook_emit_event'].called) + def test_process_new_citation_changes_sanitized_doi(self): + citation_changes = self._common_citation_changes_unsanitized_doi(adsmsg.Status.new) + doi_id = "10.5281/zenodo.11020" #Software + with TestBase.mock_multiple_targets({ + 'sanitize_zenodo_doi': patch.object(doi, 'sanitize_zenodo_doi', wraps=doi.sanitize_zenodo_doi), \ + 'citation_already_exists': patch.object(db, 'citation_already_exists', return_value=False), \ + 'get_citation_target_metadata': patch.object(db, 'get_citation_target_metadata', return_value={}), \ + 'get_citations_by_bibcode': patch.object(db, 'get_citations_by_bibcode', return_value=[]), \ + 'store_citation_target': patch.object(db, 'store_citation_target', return_value=True), \ + 'store_citation': patch.object(db, 'store_citation', return_value=True), \ + 
'store_event': patch.object(db, 'store_event', return_value=True), \ + 'update_citation': patch.object(db, 'update_citation', return_value=True), \ + 'mark_citation_as_deleted': patch.object(db, 'mark_citation_as_deleted', return_value=(True, 'REGISTERED')), \ + 'get_citations': patch.object(db, 'get_citations', return_value=[]), \ + 'update_citation_target_metadata': patch.object(db, 'update_citation_target_metadata', return_value=True), \ + 'get_citation_target_count': patch.object(db, 'get_citation_target_count', return_value=0), \ + 'get_citation_count': patch.object(db, 'get_citation_count', return_value=0), \ + 'get_citation_targets_by_bibcode': patch.object(db, 'get_citation_targets_by_bibcode', return_value=[]), \ + 'get_citation_targets_by_doi': patch.object(db, 'get_citation_targets_by_doi', return_value=[]), \ + 'get_citation_targets': patch.object(db, 'get_citation_targets', return_value=[]), \ + 'get_canonical_bibcode': patch.object(api, 'get_canonical_bibcode', return_value=citation_changes.changes[0].citing), \ + 'get_canonical_bibcodes': patch.object(api, 'get_canonical_bibcodes', return_value=[]), \ + 'request_existing_citations': patch.object(api, 'request_existing_citations', return_value=[]), \ + 'fetch_metadata': patch.object(doi, 'fetch_metadata', return_value=self.mock_data[doi_id]['raw']), \ + 'parse_metadata': patch.object(doi, 'parse_metadata', return_value=self.mock_data[doi_id]['parsed']), \ + 'build_bibcode': patch.object(doi, 'build_bibcode', wraps=doi.build_bibcode), \ + 'url_is_alive': patch.object(url, 'is_alive', return_value=True), \ + 'is_url': patch.object(url, 'is_url', wraps=url.is_url), \ + 'citation_change_to_event_data': patch.object(webhook, 'citation_change_to_event_data', wraps=webhook.citation_change_to_event_data), \ + 'identical_bibcodes_event_data': patch.object(webhook, 'identical_bibcodes_event_data', wraps=webhook.identical_bibcodes_event_data), \ + 'identical_bibcode_and_doi_event_data': patch.object(webhook, 
'identical_bibcode_and_doi_event_data', wraps=webhook.identical_bibcode_and_doi_event_data), \ + 'webhook_dump_event': patch.object(webhook, 'dump_event', return_value=True), \ + 'webhook_emit_event': patch.object(webhook, 'emit_event', return_value=True), \ + 'forward_message': patch.object(app.ADSCitationCaptureCelery, 'forward_message', return_value=True)}) as mocked: + tasks.task_process_citation_changes(citation_changes) + self.assertTrue(mocked['sanitize_zenodo_doi'].called) + self.assertTrue(mocked['citation_already_exists'].called) + self.assertTrue(mocked['get_citation_target_metadata'].called) + self.assertTrue(mocked['fetch_metadata'].called) + self.assertTrue(mocked['parse_metadata'].called) + self.assertFalse(mocked['url_is_alive'].called) + self.assertTrue(mocked['get_canonical_bibcode'].called) + self.assertTrue(mocked['get_canonical_bibcodes'].called) + self.assertTrue(mocked['get_citations_by_bibcode'].called) + self.assertTrue(mocked['store_citation_target'].called) + self.assertTrue(mocked['store_citation'].called) + self.assertFalse(mocked['update_citation'].called) + self.assertFalse(mocked['mark_citation_as_deleted'].called) + self.assertFalse(mocked['get_citations'].called) + self.assertTrue(mocked['forward_message'].called) + self.assertFalse(mocked['update_citation_target_metadata'].called) + self.assertFalse(mocked['get_citation_target_count'].called) + self.assertFalse(mocked['get_citation_count'].called) + self.assertFalse(mocked['get_citation_targets_by_bibcode'].called) + self.assertFalse(mocked['get_citation_targets_by_doi'].called) + self.assertFalse(mocked['get_citation_targets'].called) + self.assertFalse(mocked['request_existing_citations'].called) + self.assertFalse(mocked['build_bibcode'].called) + self.assertFalse(mocked['is_url'].called) + self.assertTrue(mocked['citation_change_to_event_data'].called) + self.assertFalse(mocked['identical_bibcodes_event_data'].called) + 
self.assertTrue(mocked['identical_bibcode_and_doi_event_data'].called) + self.assertTrue(mocked['store_event'].called) + self.assertTrue(mocked['webhook_dump_event'].called) + self.assertTrue(mocked['webhook_emit_event'].called) def test_process_updated_citation_changes_doi(self): citation_changes = self._common_citation_changes_doi(adsmsg.Status.updated) diff --git a/README.md b/README.md index 7ef9d23..5548540 100644 --- a/README.md +++ b/README.md @@ -518,13 +518,13 @@ Currently only sends newly registered records to Master. # Curating based on JSON from a command line argument by bibcode. python3 run.py MAINTENANCE --curation --bibcode "2021zndo...5659382R" --json '{"abstract": "Analysis software for COS observations of PG quasars from QUEST sample: Veilleux et al. 2022, ApJ, 926, 60."}' # Curating based on JSON from a command line argument by DOI. python3 run.py MAINTENANCE --curation --doi "10.5281/zenodo.5659382" --json '{"abstract": "Analysis software for COS observations of PG quasars from QUEST sample: Veilleux et al. 
2022, ApJ, 926, 60."}' # Clear curated_metadata for a given entry by bibcode python3 run.py MAINTENANCE --curation --bibcode "YYYYzndo...BCDEFGR" --reset # Clear curated_metadata for a given entry by doi python3 run.py MAINTENANCE --curation --doi "10.XYZA/zenodo.BCDEFG" --reset # Clear curated_metadata by file -python3 run.py MAINTENANCE --curation --input_filename $/path/to/input_file --reset +python3 run.py MAINTENANCE --curation --input_filename /path/to/input_file --reset # Display current metadata for a given entry by doi as standard output python3 run.py MAINTENANCE --curation --doi "10.XYZA/zenodo.BCDEFG" --show # Display current metadata for a given entry by bibcode as standard output diff --git a/alembic/versions/0b6a01b03d4d_urls.py b/alembic/versions/0b6a01b03d4d_urls.py index a85215f..875bd00 100644 --- a/alembic/versions/0b6a01b03d4d_urls.py +++ b/alembic/versions/0b6a01b03d4d_urls.py @@ -25,25 +25,32 @@ def upgrade(): def downgrade(): #Move expanded status types to old op.execute("ALTER TYPE target_status_type RENAME TO target_status_type_old") - op.execute("CREATE TYPE target_status_type AS ENUM('REGISTERED', 'DELETED','UPDATED')") + op.execute("CREATE TYPE target_status_type AS ENUM('REGISTERED','DISCARDED','DELETED','UPDATED')") #instantiate original status types op.execute("ALTER TYPE citation_status_type RENAME TO citation_status_type_old") - op.execute("CREATE TYPE citation_status_type AS ENUM('REGISTERED', 'DELETED','UPDATED')") + op.execute("CREATE TYPE citation_status_type AS ENUM('REGISTERED','DISCARDED','DELETED','UPDATED')") - #DROP expanded status columns - op.drop_column('citation_target_version','status') - op.drop_column('citation_target','status') - op.drop_column('citation_version','status') - op.drop_column('citation','status') + + def pgsql_change_type(table_name, column_name, new_enum): + return f"ALTER TABLE {table_name} \ + ALTER COLUMN {column_name} \ + SET DATA TYPE {new_enum} \ + USING ( \ + CASE 
{column_name}::text \ + WHEN 'EMITTABLE' THEN NULL \ + ELSE {column_name}::text \ + END \ + )::{new_enum}" + + #ALTER column types to original ENUM type. + op.execute(pgsql_change_type('citation_target', 'status', 'target_status_type')) + op.execute(pgsql_change_type('citation_target_version', 'status', 'target_status_type')) + op.execute(pgsql_change_type('citation', 'status', 'citation_status_type')) + op.execute(pgsql_change_type('citation_version', 'status', 'citation_status_type')) + #DROP old ENUM types op.execute("DROP TYPE target_status_type_old") op.execute("DROP TYPE citation_status_type_old") - - #ADD original status columns - op.add_column('citation_target',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', name='target_status_type'), nullable=True)) - op.add_column('citation_target_version',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', name='target_status_type'), nullable=True)) - op.add_column('citation_version',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', name='citation_status_type'), nullable=True)) - op.add_column('citation',sa.Column('status', postgresql.ENUM('REGISTERED', 'DELETED', 'DISCARDED', name='citation_status_type'), nullable=True)) - + \ No newline at end of file diff --git a/alembic/versions/8e83568feb1d_add_sanitized_value.py b/alembic/versions/8e83568feb1d_add_sanitized_value.py new file mode 100644 index 0000000..60d1d37 --- /dev/null +++ b/alembic/versions/8e83568feb1d_add_sanitized_value.py @@ -0,0 +1,81 @@ +"""add_sanitized_value + +Revision ID: 8e83568feb1d +Revises: 7021071e5e63 +Create Date: 2022-04-14 09:24:42.277371 + +""" +from alembic import op +import sqlalchemy as sa +from ADSCitationCapture import db +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision = '8e83568feb1d' +down_revision = '7021071e5e63' +branch_labels = None +depends_on = None + +def psql_copy_column_values(table_name, src_column, dst_column): + return f"UPDATE {table_name} SET {dst_column} = {src_column} WHERE {src_column} IS NOT NULL" + +def upgrade(): + connection = None + if not op.get_context().as_sql: + """ + ALTERING ENUM values cannot be done from transaction blocks. + Changing the isolation_level to autocommit puts the specific calls in individually, preventing the issue. + Alembic will warn you about changing isolation level because it has already opened a transaction which is committed. + """ + connection = op.get_bind() + connection.execution_options(isolation_level='AUTOCOMMIT') + + + op.execute("ALTER TYPE target_status_type ADD VALUE 'SANITIZED'") + op.execute("ALTER TYPE citation_status_type ADD VALUE 'SANITIZED'") + + #Add raw_citation column + op.add_column('citation', sa.Column('raw_content', sa.Text(), nullable=True)) + op.add_column('citation_version', sa.Column('raw_content', sa.Text(), nullable=True)) + op.execute(psql_copy_column_values('citation', 'content', 'raw_content')) + +def downgrade(): +#Move expanded status types to old + op.execute("ALTER TYPE target_status_type RENAME TO target_status_type_old") + op.execute("CREATE TYPE target_status_type AS ENUM('REGISTERED', 'DELETED', 'DISCARDED', 'UPDATED', 'EMITTABLE')") + + #instantiate original status types + op.execute("ALTER TYPE citation_status_type RENAME TO citation_status_type_old") + op.execute("CREATE TYPE citation_status_type AS ENUM('REGISTERED', 'DELETED', 'DISCARDED', 'UPDATED', 'EMITTABLE')") + + def pgsql_change_type(table_name, column_name, new_enum): + return f"ALTER TABLE {table_name} \ + ALTER COLUMN {column_name} \ + SET DATA TYPE {new_enum} \ + USING ( \ + CASE {column_name}::text \ + WHEN 'SANITIZED' THEN 'DISCARDED' \ + ELSE {column_name}::text \ + END \ + )::{new_enum}" + + + #Reset to original ENUM type + op.execute(pgsql_change_type('citation_target', 'status', 
'target_status_type')) + op.execute(pgsql_change_type('citation_target_version', 'status', 'target_status_type')) + op.execute(pgsql_change_type('citation', 'status', 'citation_status_type')) + op.execute(pgsql_change_type('citation_version', 'status', 'citation_status_type')) + + #Set content=raw_content so that the content column matches what it would be before this revision + op.execute(psql_copy_column_values('citation', 'raw_content', 'content')) + op.execute(psql_copy_column_values('citation_version', 'raw_content', 'content')) + + #Drop the raw_citation_column + op.drop_column('citation','raw_content') + op.drop_column('citation_version','raw_content') + + #DROP old (SANITIZED) ENUM types + op.execute("DROP TYPE target_status_type_old") + op.execute("DROP TYPE citation_status_type_old") + + \ No newline at end of file diff --git a/alembic/versions/cd06eee98a8a_doi_sanitize_and_associated.py b/alembic/versions/cd06eee98a8a_doi_sanitize_and_associated.py new file mode 100644 index 0000000..6cd75f8 --- /dev/null +++ b/alembic/versions/cd06eee98a8a_doi_sanitize_and_associated.py @@ -0,0 +1,24 @@ +"""doi_sanitize_and_associated + +Revision ID: cd06eee98a8a +Revises: 8e83568feb1d, 98cb3e36b2da +Create Date: 2022-05-16 10:35:34.117170 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = 'cd06eee98a8a' +down_revision = ('8e83568feb1d', '98cb3e36b2da') +branch_labels = None +depends_on = None + + +def upgrade(): + pass + + +def downgrade(): + pass diff --git a/run.py b/run.py index 2fdf638..6be2fe8 100755 --- a/run.py +++ b/run.py @@ -318,8 +318,8 @@ def _build_diagnostics(bibcodes=None, json_payloads=None): maintenance_parser.add_argument( '--json', dest='json_payload', - nargs='+', action='store', + nargs='+', default=None, help='Space delimited list of json curated metadata') maintenance_parser.add_argument('--input_filename',