Merge pull request #1189 from microbiomedata/issue-1175-napa-testing
prep for migration action: query schema and data
turbomam authored Oct 13, 2023
2 parents d74b0d6 + 0804156 commit 318c358
Showing 6 changed files with 87 additions and 211 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/migration.yaml
@@ -2,9 +2,7 @@ name: migration-with-java-action
on: workflow_dispatch
# wget or curl?! which is noisier
# combine run statements?
# could run jobs in parallel
# tdb1.xloader tdb2.tdbcompact tdb2.tdbloader tdb2.tdbstats tdb2.xloader tdbdump tdbloader2 tdbstats
# tdb2.tdbbackup tdb2.tdbdump tdb2.tdbquery tdb2.tdbupdate tdbbackup tdbloader tdbquery tdbupdate
# could run some jobs in parallel
jobs:
migration-with-java-job:
runs-on: ubuntu-latest
@@ -27,5 +25,8 @@ jobs:
# - run: ls -l project/owl/nmdc.ofn
- uses: foooomio/setup-jena@v2
- run: make squeaky-clean all test
- run: tdb2.tdbloader --loc=tdbcontent --graph=http://example.com/tdbcontent/ project/owl/nmdc.owl.ttl
- run: make make-rdf
- run: tdb2.tdbloader --loc=tdbcontent --graph=https://w3id.org/nmdc/nmdc project/owl/nmdc.owl.ttl
- run: tdb2.tdbloader --loc=tdbcontent --graph=mongodb://mongo-loadbalancer.nmdc.production.svc.spin.nersc.gov:27017 local/mongo_as_nmdc_database_cuire_repaired.ttl
- run: tdb2.tdbquery --loc=tdbcontent --query=assets/sparql/tdb-test.rq
- run: tdb2.tdbquery --loc=tdbcontent --query=assets/sparql/tdb-graph-list.rq
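
The tdbloader/tdbquery steps above load the schema ontology and the Mongo-derived data into one TDB2 dataset under two named graphs, then query across them. A minimal local stand-in using rdflib rather than Jena — an illustration only, and an assumption that both Turtle files exist locally; the workflow itself runs the Jena CLI tools:

from rdflib import Dataset, URIRef

# Build an in-memory dataset with the same two named graphs the workflow
# creates on disk with tdb2.tdbloader.
ds = Dataset()
ds.graph(URIRef("https://w3id.org/nmdc/nmdc")).parse(
    "project/owl/nmdc.owl.ttl", format="turtle")
ds.graph(
    URIRef("mongodb://mongo-loadbalancer.nmdc.production.svc.spin.nersc.gov:27017")
).parse("local/mongo_as_nmdc_database_cuire_repaired.ttl", format="turtle")

# A graph name is just an IRI-shaped label, which is why a mongodb:// URI
# can serve to tag the data graph.
for g in ds.graphs():
    print(g.identifier)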
16 changes: 16 additions & 0 deletions assets/sparql/tdb-graph-list.rq
@@ -0,0 +1,16 @@
prefix dcterms: <http://purl.org/dc/terms/>
prefix linkml: <https://w3id.org/linkml/>
prefix mixs: <https://w3id.org/mixs/>
prefix nmdc: <https://w3id.org/nmdc/>
prefix owl: <http://www.w3.org/2002/07/owl#>
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix skos: <http://www.w3.org/2004/02/skos/core#>
prefix tdbcontent: <http://example.com/tdbcontent/>
prefix src-schema: <https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/src/schema/>
select distinct ?g
where {
graph ?g {
?s ?p ?o .
}
}
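
tdb-graph-list.rq simply enumerates the named graphs that hold at least one triple; none of the declared prefixes are used by the pattern itself, though they do no harm. Continuing the rdflib sketch above, the same query can be run in memory (tdb2.tdbquery prints a result table instead):

graph_list_rq = """
select distinct ?g
where {
  graph ?g {
    ?s ?p ?o .
  }
}
"""
for row in ds.query(graph_list_rq):
    print(row.g)  # expect the two graph IRIs loaded by the workflow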
Empty file.
115 changes: 0 additions & 115 deletions nmdc_schema/migration_recursion.py
@@ -14,98 +14,11 @@
doi_url_pattern = r'^https?:\/\/[a-zA-Z\.]+\/10\.'
curie_pattern = r'^[a-zA-Z_][a-zA-Z0-9_-]*:[a-zA-Z0-9_][a-zA-Z0-9_.-]*$'

#migration_plan = {
#
#}


class Migrator:
def __init__(self):
self.forced_prefix = None

# # this migration of doi slots in Study objects had been completed
# def migrate_studies_7_7_2_to_7_8(self, retrieved_study):
# logger.info(f"Starting migration of {retrieved_study['id']}")
# if "doi" in retrieved_study:
# # logger.info(f"Before migration: {pprint.pformat(retrieved_study['doi'])}")
#
# match = re.search(doi_url_pattern, retrieved_study["doi"]['has_raw_value'])
# if match:
# start_index = match.end()
# as_curie = f"doi:10.{retrieved_study['doi']['has_raw_value'][start_index:]}"
# retrieved_study["award_dois"] = [as_curie]
# del retrieved_study["doi"]
# return retrieved_study

def migrate_extractions_7_8_0_to_8_0_0(self, retrieved_extraction):
logger.info(f"Starting migration of {retrieved_extraction['id']}")

# change slot name from sample_mass to input_mass
if "sample_mass" in retrieved_extraction:
retrieved_extraction['input_mass'] = retrieved_extraction.pop('sample_mass')
return retrieved_extraction

def migrate_uc_gold_sequencing_project_identifiers_7_8_0_to_8_0_0(self, retrieved_omics_processing):

migrated_gold_identifiers = []
if "gold_sequencing_project_identifiers" in retrieved_omics_processing and retrieved_omics_processing[
"gold_sequencing_project_identifiers"]:
for i in retrieved_omics_processing["gold_sequencing_project_identifiers"]:
logger.info(f"migrating gold_sequencing_project_identifiers {i}")
curie_parts = i.split(':')
curie_prefix = curie_parts[0]
curie_local_id = curie_parts[1]

if curie_prefix == "GOLD":
migrated_gold_identifiers.append(f"gold:{curie_local_id}")
else:
migrated_gold_identifiers.append(i)
retrieved_omics_processing["gold_sequencing_project_identifiers"] = migrated_gold_identifiers

return retrieved_omics_processing

def migrate_uc_gold_biosample_identifiers_7_8_0_to_8_0_0(self, retrieved_biosample):

# todo refactor? this shouldn't be long-lived code

migrated_gold_identifiers = []
if "gold_biosample_identifiers" in retrieved_biosample and retrieved_biosample[
"gold_biosample_identifiers"]:
for i in retrieved_biosample["gold_biosample_identifiers"]:
logger.info(f"migrating gold_biosample_identifiers {i}")
curie_parts = i.split(':')
curie_prefix = curie_parts[0]
curie_local_id = curie_parts[1]

if curie_prefix == "GOLD":
migrated_gold_identifiers.append(f"gold:{curie_local_id}")
else:
migrated_gold_identifiers.append(i)
retrieved_biosample["gold_biosample_identifiers"] = migrated_gold_identifiers

return retrieved_biosample

def migrate_uc_gold_study_identifiers_7_8_0_to_8_0_0(self, retrieved_study):

# todo refactor? this shouldn't be long-lived code

migrated_gold_identifiers = []
if "gold_study_identifiers" in retrieved_study and retrieved_study[
"gold_study_identifiers"]:
for i in retrieved_study["gold_study_identifiers"]:
logger.info(f"migrating gold_study_identifiers {i}")
curie_parts = i.split(':')
curie_prefix = curie_parts[0]
curie_local_id = curie_parts[1]

if curie_prefix == "GOLD":
migrated_gold_identifiers.append(f"gold:{curie_local_id}")
else:
migrated_gold_identifiers.append(i)
retrieved_study["gold_study_identifiers"] = migrated_gold_identifiers

return retrieved_study

def check_and_normalize_one_curie(self, curie_string):
if not self.is_valid_curie(curie_string):
curie_string = self.normalize_curie(curie_string)
@@ -198,34 +111,6 @@ def main(schema_path, input_path, output_path, salvage_prefix):
for tdk, tdv in total_dict.items():
logger.info(f"Starting migration of {tdk}")
end_dict[tdk] = migrator.apply_changes_recursively_by_key(tdv, set(migrateable_slots))

# if tdk == "study_set":
# logger.info(f"Starting {tdk}-specific migrations")
# for current_study in tdv:
# migrator.migrate_studies_7_7_2_to_7_8(current_study)

####

# if tdk == "biosample_set":
# logger.info(f"Starting {tdk}-specific migrations")
# for current_biosample in tdv:
# migrator.migrate_uc_gold_biosample_identifiers_7_8_0_to_8_0_0(current_biosample)

# if tdk == "extraction_set":
# logger.info(f"Starting {tdk}-specific migrations")
# for current_extraction in tdv:
# migrator.migrate_extractions_7_8_0_to_8_0_0(current_extraction)

# if tdk == "omics_processing_set":
# logger.info(f"Starting {tdk}-specific migrations")
# for current_omics_processing in tdv:
# migrator.migrate_uc_gold_sequencing_project_identifiers_7_8_0_to_8_0_0(current_omics_processing)

# if tdk == "study_set":
# logger.info(f"Starting {tdk}-specific migrations")
# for current_study in tdv:
# migrator.migrate_uc_gold_study_identifiers_7_8_0_to_8_0_0(current_study)


logger.info(f"Saving migrated data to {output_path}")
with open(output_path, "w") as f:
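
The three removed migrate_uc_gold_* methods were identical except for the slot they rewrote: each lowercased a GOLD: prefix while passing other CURIEs through. A consolidated sketch of that shared pattern — lowercase_gold_prefix and is_valid_curie are hypothetical helpers, not nmdc-schema code; the regex is copied from the module above:

import re

curie_pattern = r'^[a-zA-Z_][a-zA-Z0-9_-]*:[a-zA-Z0-9_][a-zA-Z0-9_.-]*$'

def is_valid_curie(curie: str) -> bool:
    return re.match(curie_pattern, curie) is not None

def lowercase_gold_prefix(document: dict, slot: str) -> dict:
    """Rewrite GOLD:<id> identifiers stored under `slot` as gold:<id>,
    leaving all other CURIEs untouched."""
    identifiers = document.get(slot) or []
    migrated = []
    for curie in identifiers:
        prefix, _, local_id = curie.partition(":")
        migrated.append(f"gold:{local_id}" if prefix == "GOLD" else curie)
    if identifiers:
        document[slot] = migrated
    return document

# e.g. lowercase_gold_prefix(retrieved_biosample, "gold_biosample_identifiers")
#      lowercase_gold_prefix(retrieved_study, "gold_study_identifiers")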
108 changes: 59 additions & 49 deletions nmdc_schema/mongo_dump_api_emph.py
@@ -236,6 +236,7 @@ def get_class_slots(self, class_name, include_scalars=False):
'https://api-dev.microbiomedata.org/docs or https://api.microbiomedata.org/docs')
@click.option('--endpoint-prefix', default="nmdcschema",
show_default=True, help='FastAPI path component between the URL and the endpoint name')
@click.option('--collection-check/--skip-collection-check', default=True)
def cli(
admin_db,
env_file,
@@ -250,52 +251,59 @@
max_docs_per_coll,
page_size,
output_yaml,
collection_check,

):
nmdc_pymongo_client = PyMongoClient(
admin_db=admin_db,
auth_mechanism='SCRAM-SHA-256',
direct_connection=True,
env_file=env_file,
mongo_db_name=mongo_db_name,
mongo_host=mongo_host,
mongo_port=mongo_port,
)
# logger.info(f"{nmdc_pymongo_client.collections = }")

nmdc_helper = ViewHelper(schema_file)

# logger.info(f"{nmdc_helper.view.schema.name = }")

root_class_slots = nmdc_helper.get_class_slots(root_class)

# logger.info(f"{root_class_slots = }")

schema_vs_mongo_collections = set_arithmetic(set(root_class_slots), set(nmdc_pymongo_client.collections),
set1_name='schema',
set2_name='mongo')

logger.info(f"schema_vs_mongo_collections = ")
logger.info(pprint.pformat(schema_vs_mongo_collections))

available_selected_collections = []
if len(selected_collections) > 0:
available_vs_selected_collections = set_arithmetic(set(schema_vs_mongo_collections['intersection']),
set(selected_collections), set1_name='available',
set2_name='selected')
logger.debug(f"available_vs_selected_collections = ")
logger.debug(pprint.pformat(available_vs_selected_collections))
available_selected_collections = available_vs_selected_collections['intersection']
if available_vs_selected_collections['selected only']:
logger.warning(
f"Some requested collections are not available: {available_vs_selected_collections['selected only']}")
# selected_collections = []
est_doc_count = 0
if collection_check:
nmdc_pymongo_client = PyMongoClient(
admin_db=admin_db,
auth_mechanism='SCRAM-SHA-256',
direct_connection=True,
env_file=env_file,
mongo_db_name=mongo_db_name,
mongo_host=mongo_host,
mongo_port=mongo_port,
)
# logger.info(f"{nmdc_pymongo_client.collections = }")

nmdc_helper = ViewHelper(schema_file)

# logger.info(f"{nmdc_helper.view.schema.name = }")

root_class_slots = nmdc_helper.get_class_slots(root_class)

# logger.info(f"{root_class_slots = }")

schema_vs_mongo_collections = set_arithmetic(set(root_class_slots), set(nmdc_pymongo_client.collections),
set1_name='schema',
set2_name='mongo')

logger.info(f"schema_vs_mongo_collections = ")
logger.info(pprint.pformat(schema_vs_mongo_collections))

available_selected_collections = []
if len(selected_collections) > 0:
available_vs_selected_collections = set_arithmetic(set(schema_vs_mongo_collections['intersection']),
set(selected_collections), set1_name='available',
set2_name='selected')
logger.debug(f"available_vs_selected_collections = ")
logger.debug(pprint.pformat(available_vs_selected_collections))
available_selected_collections = available_vs_selected_collections['intersection']
if available_vs_selected_collections['selected only']:
logger.warning(
f"Some requested collections are not available: {available_vs_selected_collections['selected only']}")
else:
available_selected_collections = schema_vs_mongo_collections['intersection']
available_selected_collections.sort()
logger.info(f"available_selected_collections = ")
logger.info(pprint.pformat(available_selected_collections))

selected_collections = available_selected_collections
else:
available_selected_collections = schema_vs_mongo_collections['intersection']
available_selected_collections.sort()
logger.info(f"available_selected_collections = ")
logger.info(pprint.pformat(available_selected_collections))

nmdc_pymongo_client.selected_collections = available_selected_collections
selected_collections = selected_collections
logger.info(f"{selected_collections = }")

# collection_stats = nmdc_pymongo_client.get_collection_stats()

@@ -305,14 +313,16 @@
page_size = max_docs_per_coll

nmdc_database_object = {}
for current_collection in nmdc_pymongo_client.selected_collections:
logger.info(f"Attempting to get collection stats from {current_collection}")
for current_collection in selected_collections:

current_coll_obj = nmdc_pymongo_client.db[current_collection]
est_doc_count = current_coll_obj.estimated_document_count()
if collection_check:
logger.info(f"Attempting to get collection stats from {current_collection}")

logger.info(
f"estimated_document_count = {est_doc_count}") # it would also be nice to report collection size or avg size/doc but I haven't figured how to do that quickly yet
current_coll_obj = nmdc_pymongo_client.db[current_collection]
est_doc_count = current_coll_obj.estimated_document_count()

logger.info(
f"estimated_document_count = {est_doc_count}") # it would also be nice to report collection size or avg size/doc but I haven't figured how to do that quickly yet

# # collection_stats = current_coll_obj.estimated_document_count()
# collection_stats = current_coll_obj.stats()
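
The new --collection-check/--skip-collection-check pair is a standard Click boolean flag: the first name sets the parameter True, the second False, with True as the default. A stripped-down sketch of the control flow this commit adds — not the real nmdc-schema CLI:

import click

@click.command()
@click.option("--collection-check/--skip-collection-check", default=True,
              show_default=True,
              help="Verify schema slots against live Mongo collections before dumping.")
@click.option("--selected-collections", multiple=True)
def cli(collection_check, selected_collections):
    if collection_check:
        # Connect to Mongo, list collections, intersect with schema slots.
        click.echo("checking requested collections against schema and Mongo")
    else:
        # Trust the caller; no Mongo round-trip before the dump loop.
        click.echo(f"skipping check; using {list(selected_collections)}")

if __name__ == "__main__":
    cli()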
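set_arithmetic, used twice above, reports what two sets share and where they differ, keyed by caller-supplied names ("schema only", "mongo only", "intersection", and so on). A hypothetical reimplementation consistent with the keys the CLI reads; the real helper lives elsewhere in nmdc-schema and may differ:

def set_arithmetic(set1, set2, set1_name="set1", set2_name="set2"):
    return {
        f"{set1_name} only": sorted(set1 - set2),
        f"{set2_name} only": sorted(set2 - set1),
        "intersection": sorted(set1 & set2),
    }

schema_slots = {"biosample_set", "study_set", "functional_annotation_agg"}
mongo_collections = {"biosample_set", "study_set", "fs.files"}
report = set_arithmetic(schema_slots, mongo_collections,
                        set1_name="schema", set2_name="mongo")
print(report["intersection"])  # collections both modeled and present
print(report["mongo only"])    # live collections the schema does not model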
50 changes: 7 additions & 43 deletions project.Makefile
@@ -406,56 +406,25 @@ nmdc_schema/nmdc_schema_accepting_legacy_ids.py: nmdc_schema/nmdc_schema_accepti

# ----

# recommended setup:
# 1. . ~/sshproxy.sh -u {YOUR_NERSC_USERNAME}
# 2. ssh -i ~/.ssh/nersc -L27777:mongo-loadbalancer.nmdc.production.svc.spin.nersc.org:27017 -o ServerAliveInterval=60 {YOUR_NERSC_USERNAME}@dtn01.nersc.gov

# todo mongodb collection stats vs Database slots report
# todo convert to json
# todo compress large files
# todo: switch to API method for getting collection names and stats: https://api.microbiomedata.org/nmdcschema/collection_stats # partially implemented

make-rdf: rdf-clean local/mongo_as_nmdc_database_validation.log local/mongo_as_nmdc_database_cuire_repaired.ttl

temp:

# --selected-collections functional_annotation_agg \ # huge, no publicly available reference data (kegg)
# --selected-collections metaproteomics_analysis_activity_set \ # next slowest

# when connecting to the dev MongoDB, also use --client-base-url https://api-dev.microbiomedata.org
# make pre-composed prod and dev makefile tasks! since they require the user to provide a matching mongo port and api url

local/mongo_as_unvalidated_nmdc_database.yaml:
date # 276.50 seconds on 2023-08-30 without functional_annotation_agg or metaproteomics_analysis_activity_set
time $(RUN) pure-export \
--client-base-url https://api.microbiomedata.org \
--client-base-url https://api-napa.microbiomedata.org \
--endpoint-prefix nmdcschema \
--env-file local/.env \
--max-docs-per-coll 10000000 \
--mongo-db-name nmdc \
--mongo-host localhost \
--mongo-port 27777 \
--output-yaml $@ \
--page-size 10000 \
--schema-file src/schema/nmdc.yaml \
--selected-collections biosample_set \
--selected-collections data_object_set \
--selected-collections extraction_set \
--selected-collections field_research_site_set \
--selected-collections library_preparation_set \
--selected-collections mags_activity_set \
--selected-collections metabolomics_analysis_activity_set \
--selected-collections metagenome_annotation_activity_set \
--selected-collections metagenome_assembly_set \
--selected-collections metagenome_sequencing_activity_set \
--selected-collections metatranscriptome_activity_set \
--selected-collections nom_analysis_activity_set \
--selected-collections omics_processing_set \
--selected-collections pooling_set \
--selected-collections processed_sample_set \
--selected-collections read_based_taxonomy_analysis_activity_set \
--selected-collections read_qc_analysis_activity_set \
--selected-collections study_set
--selected-collections study_set \
--skip-collection-check


local/mongo_as_nmdc_database_rdf_safe.yaml: nmdc_schema/nmdc_schema_accepting_legacy_ids.yaml local/mongo_as_unvalidated_nmdc_database.yaml
@@ -471,16 +440,11 @@ local/mongo_as_nmdc_database_validation.log: nmdc_schema/nmdc_schema_accepting_l
date # 5m57.559s without functional_annotation_agg or metaproteomics_analysis_activity_set
time $(RUN) linkml-validate --schema $^ > $@

# from riot:
# WARNING: java.io.tmpdir directory does not exist

local/mongo_as_nmdc_database.ttl: nmdc_schema/nmdc_schema_accepting_legacy_ids.yaml local/mongo_as_nmdc_database_rdf_safe.yaml
date # 681.99 seconds on 2023-08-30 without functional_annotation_agg or metaproteomics_analysis_activity_set
time $(RUN) linkml-convert --output $@ --schema $^
export _JAVA_OPTIONS=-Djava.io.tmpdir=local
- $(RIOT) --validate $@ # < 1 minute

# todo: still getting anyurl typed string statement objects in RDF. I added a workarround in anyuri-strings-to-iris
# export _JAVA_OPTIONS=-Djava.io.tmpdir=local
# - $(RIOT) --validate $@ # < 1 minute

local/mongo_as_nmdc_database_cuire_repaired.ttl: local/mongo_as_nmdc_database.ttl
date
Expand All @@ -489,8 +453,8 @@ local/mongo_as_nmdc_database_cuire_repaired.ttl: local/mongo_as_nmdc_database.tt
--jsonld-context-jsons project/jsonld/nmdc.context.jsonld \
--emsl-biosample-uuid-replacement emsl_biosample_uuid_like \
--output-ttl $@
export _JAVA_OPTIONS=-Djava.io.tmpdir=local
- $(RIOT) --validate $@ # < 1 minute
# export _JAVA_OPTIONS=-Djava.io.tmpdir=local
# - $(RIOT) --validate $@ # < 1 minute
date

# ----
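
The pure-export target pages through each selected collection (--page-size 10000) up to a cap (--max-docs-per-coll 10000000) over the ssh tunnel described at the top of this Makefile section, with local port 27777 forwarded to the NERSC Mongo load balancer. A sketch of one way such paging can work with pymongo — sorting on _id and resuming past the last id seen; the real pure-export implementation may page differently:

from pymongo import MongoClient

def dump_collection_paged(client, db_name, collection,
                          page_size=10_000, max_docs=10_000_000):
    """Fetch up to max_docs documents in _id-ordered pages."""
    coll = client[db_name][collection]
    docs, last_id = [], None
    while len(docs) < max_docs:
        query = {"_id": {"$gt": last_id}} if last_id is not None else {}
        page = list(coll.find(query).sort("_id", 1).limit(page_size))
        if not page:
            break
        docs.extend(page)
        last_id = page[-1]["_id"]
    return docs[:max_docs]

# Usage, assuming the ssh tunnel from the comments above is running:
# client = MongoClient("mongodb://localhost:27777", directConnection=True)
# biosamples = dump_collection_paged(client, "nmdc", "biosample_set")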
