Merge pull request #1189 from microbiomedata/issue-1175-napa-testing
prep for migration action: query schema and data
turbomam authored Oct 13, 2023
2 parents d74b0d6 + 0804156 commit 318c358
Showing 6 changed files with 87 additions and 211 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/migration.yaml
@@ -2,9 +2,7 @@ name: migration-with-java-action
on: workflow_dispatch
# wget or curl?! which is noisier
# combine run statements?
# could run jobs in parallel
# tdb1.xloader tdb2.tdbcompact tdb2.tdbloader tdb2.tdbstats tdb2.xloader tdbdump tdbloader2 tdbstats
# tdb2.tdbbackup tdb2.tdbdump tdb2.tdbquery tdb2.tdbupdate tdbbackup tdbloader tdbquery tdbupdate
# could run some jobs in parallel
jobs:
migration-with-java-job:
runs-on: ubuntu-latest
@@ -27,5 +25,8 @@ jobs:
# - run: ls -l project/owl/nmdc.ofn
- uses: foooomio/setup-jena@v2
- run: make squeaky-clean all test
- run: tdb2.tdbloader --loc=tdbcontent --graph=http://example.com/tdbcontent/ project/owl/nmdc.owl.ttl
- run: make make-rdf
- run: tdb2.tdbloader --loc=tdbcontent --graph=https://w3id.org/nmdc/nmdc project/owl/nmdc.owl.ttl
- run: tdb2.tdbloader --loc=tdbcontent --graph=mongodb://mongo-loadbalancer.nmdc.production.svc.spin.nersc.gov:27017 local/mongo_as_nmdc_database_cuire_repaired.ttl
- run: tdb2.tdbquery --loc=tdbcontent --query=assets/sparql/tdb-test.rq
- run: tdb2.tdbquery --loc=tdbcontent --query=assets/sparql/tdb-graph-list.rq
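
The tdbloader/tdbquery steps above load the schema ontology and the Mongo-derived data into one TDB2 dataset under two named graphs, then query across them. A minimal local stand-in using rdflib rather than Jena — an illustration only, and an assumption that both Turtle files exist locally; the workflow itself runs the Jena CLI tools:

from rdflib import Dataset, URIRef

# Build an in-memory dataset with the same two named graphs the workflow
# creates on disk with tdb2.tdbloader.
ds = Dataset()
ds.graph(URIRef("https://w3id.org/nmdc/nmdc")).parse(
    "project/owl/nmdc.owl.ttl", format="turtle")
ds.graph(
    URIRef("mongodb://mongo-loadbalancer.nmdc.production.svc.spin.nersc.gov:27017")
).parse("local/mongo_as_nmdc_database_cuire_repaired.ttl", format="turtle")

# A graph name is just an IRI-shaped label, which is why a mongodb:// URI
# can serve to tag the data graph.
for g in ds.graphs():
    print(g.identifier)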
16 changes: 16 additions & 0 deletions assets/sparql/tdb-graph-list.rq
@@ -0,0 +1,16 @@
prefix dcterms: <http://purl.org/dc/terms/>
prefix linkml: <https://w3id.org/linkml/>
prefix mixs: <https://w3id.org/mixs/>
prefix nmdc: <https://w3id.org/nmdc/>
prefix owl: <http://www.w3.org/2002/07/owl#>
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix skos: <http://www.w3.org/2004/02/skos/core#>
prefix tdbcontent: <http://example.com/tdbcontent/>
prefix src-schema: <https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/src/schema/>
select distinct ?g
where {
graph ?g {
?s ?p ?o .
}
}
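
tdb-graph-list.rq simply enumerates the named graphs that hold at least one triple; none of the declared prefixes are used by the pattern itself, though they do no harm. Continuing the rdflib sketch above, the same query can be run in memory (tdb2.tdbquery prints a result table instead):

graph_list_rq = """
select distinct ?g
where {
  graph ?g {
    ?s ?p ?o .
  }
}
"""
for row in ds.query(graph_list_rq):
    print(row.g)  # expect the two graph IRIs loaded by the workflow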
Empty file.
115 changes: 0 additions & 115 deletions nmdc_schema/migration_recursion.py
@@ -14,98 +14,11 @@
doi_url_pattern = r'^https?:\/\/[a-zA-Z\.]+\/10\.'
curie_pattern = r'^[a-zA-Z_][a-zA-Z0-9_-]*:[a-zA-Z0-9_][a-zA-Z0-9_.-]*$'

#migration_plan = {
#
#}


class Migrator:
def __init__(self):
self.forced_prefix = None

# # this migration of doi slots in Study objects had been completed
# def migrate_studies_7_7_2_to_7_8(self, retrieved_study):
# logger.info(f"Starting migration of {retrieved_study['id']}")
# if "doi" in retrieved_study:
# # logger.info(f"Before migration: {pprint.pformat(retrieved_study['doi'])}")
#
# match = re.search(doi_url_pattern, retrieved_study["doi"]['has_raw_value'])
# if match:
# start_index = match.end()
# as_curie = f"doi:10.{retrieved_study['doi']['has_raw_value'][start_index:]}"
# retrieved_study["award_dois"] = [as_curie]
# del retrieved_study["doi"]
# return retrieved_study

def migrate_extractions_7_8_0_to_8_0_0(self, retrieved_extraction):
logger.info(f"Starting migration of {retrieved_extraction['id']}")

# change slot name from sample_mass to input_mass
if "sample_mass" in retrieved_extraction:
retrieved_extraction['input_mass'] = retrieved_extraction.pop('sample_mass')
return retrieved_extraction

def migrate_uc_gold_sequencing_project_identifiers_7_8_0_to_8_0_0(self, retrieved_omics_processing):

migrated_gold_identifiers = []
if "gold_sequencing_project_identifiers" in retrieved_omics_processing and retrieved_omics_processing[
"gold_sequencing_project_identifiers"]:
for i in retrieved_omics_processing["gold_sequencing_project_identifiers"]:
logger.info(f"migrating gold_sequencing_project_identifiers {i}")
curie_parts = i.split(':')
curie_prefix = curie_parts[0]
curie_local_id = curie_parts[1]

if curie_prefix == "GOLD":
migrated_gold_identifiers.append(f"gold:{curie_local_id}")
else:
migrated_gold_identifiers.append(i)
retrieved_omics_processing["gold_sequencing_project_identifiers"] = migrated_gold_identifiers

return retrieved_omics_processing

def migrate_uc_gold_biosample_identifiers_7_8_0_to_8_0_0(self, retrieved_biosample):

# todo refactor? this shouldn't be long-lived code

migrated_gold_identifiers = []
if "gold_biosample_identifiers" in retrieved_biosample and retrieved_biosample[
"gold_biosample_identifiers"]:
for i in retrieved_biosample["gold_biosample_identifiers"]:
logger.info(f"migrating gold_biosample_identifiers {i}")
curie_parts = i.split(':')
curie_prefix = curie_parts[0]
curie_local_id = curie_parts[1]

if curie_prefix == "GOLD":
migrated_gold_identifiers.append(f"gold:{curie_local_id}")
else:
migrated_gold_identifiers.append(i)
retrieved_biosample["gold_biosample_identifiers"] = migrated_gold_identifiers

return retrieved_biosample

def migrate_uc_gold_study_identifiers_7_8_0_to_8_0_0(self, retrieved_study):

# todo refactor? this shouldn't be long-lived code

migrated_gold_identifiers = []
if "gold_study_identifiers" in retrieved_study and retrieved_study[
"gold_study_identifiers"]:
for i in retrieved_study["gold_study_identifiers"]:
logger.info(f"migrating gold_study_identifiers {i}")
curie_parts = i.split(':')
curie_prefix = curie_parts[0]
curie_local_id = curie_parts[1]

if curie_prefix == "GOLD":
migrated_gold_identifiers.append(f"gold:{curie_local_id}")
else:
migrated_gold_identifiers.append(i)
retrieved_study["gold_study_identifiers"] = migrated_gold_identifiers

return retrieved_study

def check_and_normalize_one_curie(self, curie_string):
if not self.is_valid_curie(curie_string):
curie_string = self.normalize_curie(curie_string)
@@ -198,34 +111,6 @@ def main(schema_path, input_path, output_path, salvage_prefix):
for tdk, tdv in total_dict.items():
logger.info(f"Starting migration of {tdk}")
end_dict[tdk] = migrator.apply_changes_recursively_by_key(tdv, set(migrateable_slots))

# if tdk == "study_set":
# logger.info(f"Starting {tdk}-specific migrations")
# for current_study in tdv:
# migrator.migrate_studies_7_7_2_to_7_8(current_study)

####

# if tdk == "biosample_set":
# logger.info(f"Starting {tdk}-specific migrations")
# for current_biosample in tdv:
# migrator.migrate_uc_gold_biosample_identifiers_7_8_0_to_8_0_0(current_biosample)

# if tdk == "extraction_set":
# logger.info(f"Starting {tdk}-specific migrations")
# for current_extraction in tdv:
# migrator.migrate_extractions_7_8_0_to_8_0_0(current_extraction)

# if tdk == "omics_processing_set":
# logger.info(f"Starting {tdk}-specific migrations")
# for current_omics_processing in tdv:
# migrator.migrate_uc_gold_sequencing_project_identifiers_7_8_0_to_8_0_0(current_omics_processing)

# if tdk == "study_set":
# logger.info(f"Starting {tdk}-specific migrations")
# for current_study in tdv:
# migrator.migrate_uc_gold_study_identifiers_7_8_0_to_8_0_0(current_study)


logger.info(f"Saving migrated data to {output_path}")
with open(output_path, "w") as f:
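
The three removed migrate_uc_gold_* methods were identical except for the slot they rewrote: each lowercased a GOLD: prefix while passing other CURIEs through. A consolidated sketch of that shared pattern — lowercase_gold_prefix and is_valid_curie are hypothetical helpers, not nmdc-schema code; the regex is copied from the module above:

import re

curie_pattern = r'^[a-zA-Z_][a-zA-Z0-9_-]*:[a-zA-Z0-9_][a-zA-Z0-9_.-]*$'

def is_valid_curie(curie: str) -> bool:
    return re.match(curie_pattern, curie) is not None

def lowercase_gold_prefix(document: dict, slot: str) -> dict:
    """Rewrite GOLD:<id> identifiers stored under `slot` as gold:<id>,
    leaving all other CURIEs untouched."""
    identifiers = document.get(slot) or []
    migrated = []
    for curie in identifiers:
        prefix, _, local_id = curie.partition(":")
        migrated.append(f"gold:{local_id}" if prefix == "GOLD" else curie)
    if identifiers:
        document[slot] = migrated
    return document

# e.g. lowercase_gold_prefix(retrieved_biosample, "gold_biosample_identifiers")
#      lowercase_gold_prefix(retrieved_study, "gold_study_identifiers")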
108 changes: 59 additions & 49 deletions nmdc_schema/mongo_dump_api_emph.py
@@ -236,6 +236,7 @@ def get_class_slots(self, class_name, include_scalars=False):
'https://api-dev.microbiomedata.org/docs or https://api.microbiomedata.org/docs')
@click.option('--endpoint-prefix', default="nmdcschema",
show_default=True, help='FastAPI path component between the URL and the endpoint name')
@click.option('--collection-check/--skip-collection-check', default=True)
def cli(
admin_db,
env_file,
@@ -250,52 +251,59 @@
max_docs_per_coll,
page_size,
output_yaml,
collection_check,

):
nmdc_pymongo_client = PyMongoClient(
admin_db=admin_db,
auth_mechanism='SCRAM-SHA-256',
direct_connection=True,
env_file=env_file,
mongo_db_name=mongo_db_name,
mongo_host=mongo_host,
mongo_port=mongo_port,
)
# logger.info(f"{nmdc_pymongo_client.collections = }")

nmdc_helper = ViewHelper(schema_file)

# logger.info(f"{nmdc_helper.view.schema.name = }")

root_class_slots = nmdc_helper.get_class_slots(root_class)

# logger.info(f"{root_class_slots = }")

schema_vs_mongo_collections = set_arithmetic(set(root_class_slots), set(nmdc_pymongo_client.collections),
set1_name='schema',
set2_name='mongo')

logger.info(f"schema_vs_mongo_collections = ")
logger.info(pprint.pformat(schema_vs_mongo_collections))

available_selected_collections = []
if len(selected_collections) > 0:
available_vs_selected_collections = set_arithmetic(set(schema_vs_mongo_collections['intersection']),
set(selected_collections), set1_name='available',
set2_name='selected')
logger.debug(f"available_vs_selected_collections = ")
logger.debug(pprint.pformat(available_vs_selected_collections))
available_selected_collections = available_vs_selected_collections['intersection']
if available_vs_selected_collections['selected only']:
logger.warning(
f"Some requested collections are not available: {available_vs_selected_collections['selected only']}")
# selected_collections = []
est_doc_count = 0
if collection_check:
nmdc_pymongo_client = PyMongoClient(
admin_db=admin_db,
auth_mechanism='SCRAM-SHA-256',
direct_connection=True,
env_file=env_file,
mongo_db_name=mongo_db_name,
mongo_host=mongo_host,
mongo_port=mongo_port,
)
# logger.info(f"{nmdc_pymongo_client.collections = }")

nmdc_helper = ViewHelper(schema_file)

# logger.info(f"{nmdc_helper.view.schema.name = }")

root_class_slots = nmdc_helper.get_class_slots(root_class)

# logger.info(f"{root_class_slots = }")

schema_vs_mongo_collections = set_arithmetic(set(root_class_slots), set(nmdc_pymongo_client.collections),
set1_name='schema',
set2_name='mongo')

logger.info(f"schema_vs_mongo_collections = ")
logger.info(pprint.pformat(schema_vs_mongo_collections))

available_selected_collections = []
if len(selected_collections) > 0:
available_vs_selected_collections = set_arithmetic(set(schema_vs_mongo_collections['intersection']),
set(selected_collections), set1_name='available',
set2_name='selected')
logger.debug(f"available_vs_selected_collections = ")
logger.debug(pprint.pformat(available_vs_selected_collections))
available_selected_collections = available_vs_selected_collections['intersection']
if available_vs_selected_collections['selected only']:
logger.warning(
f"Some requested collections are not available: {available_vs_selected_collections['selected only']}")
else:
available_selected_collections = schema_vs_mongo_collections['intersection']
available_selected_collections.sort()
logger.info(f"available_selected_collections = ")
logger.info(pprint.pformat(available_selected_collections))

selected_collections = available_selected_collections
else:
available_selected_collections = schema_vs_mongo_collections['intersection']
available_selected_collections.sort()
logger.info(f"available_selected_collections = ")
logger.info(pprint.pformat(available_selected_collections))

nmdc_pymongo_client.selected_collections = available_selected_collections
selected_collections = selected_collections
logger.info(f"{selected_collections = }")

# collection_stats = nmdc_pymongo_client.get_collection_stats()

@@ -305,14 +313,16 @@
page_size = max_docs_per_coll

nmdc_database_object = {}
for current_collection in nmdc_pymongo_client.selected_collections:
logger.info(f"Attempting to get collection stats from {current_collection}")
for current_collection in selected_collections:

current_coll_obj = nmdc_pymongo_client.db[current_collection]
est_doc_count = current_coll_obj.estimated_document_count()
if collection_check:
logger.info(f"Attempting to get collection stats from {current_collection}")

logger.info(
f"estimated_document_count = {est_doc_count}") # it would also be nice to report collection size or avg size/doc but I haven't figured how to do that quickly yet
current_coll_obj = nmdc_pymongo_client.db[current_collection]
est_doc_count = current_coll_obj.estimated_document_count()

logger.info(
f"estimated_document_count = {est_doc_count}") # it would also be nice to report collection size or avg size/doc but I haven't figured how to do that quickly yet

# # collection_stats = current_coll_obj.estimated_document_count()
# collection_stats = current_coll_obj.stats()
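
The new --collection-check/--skip-collection-check pair is a standard Click boolean flag: the first name sets the parameter True, the second False, with True as the default. A stripped-down sketch of the control flow this commit adds — not the real nmdc-schema CLI:

import click

@click.command()
@click.option("--collection-check/--skip-collection-check", default=True,
              show_default=True,
              help="Verify schema slots against live Mongo collections before dumping.")
@click.option("--selected-collections", multiple=True)
def cli(collection_check, selected_collections):
    if collection_check:
        # Connect to Mongo, list collections, intersect with schema slots.
        click.echo("checking requested collections against schema and Mongo")
    else:
        # Trust the caller; no Mongo round-trip before the dump loop.
        click.echo(f"skipping check; using {list(selected_collections)}")

if __name__ == "__main__":
    cli()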
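set_arithmetic, used twice above, reports what two sets share and where they differ, keyed by caller-supplied names ("schema only", "mongo only", "intersection", and so on). A hypothetical reimplementation consistent with the keys the CLI reads; the real helper lives elsewhere in nmdc-schema and may differ:

def set_arithmetic(set1, set2, set1_name="set1", set2_name="set2"):
    return {
        f"{set1_name} only": sorted(set1 - set2),
        f"{set2_name} only": sorted(set2 - set1),
        "intersection": sorted(set1 & set2),
    }

schema_slots = {"biosample_set", "study_set", "functional_annotation_agg"}
mongo_collections = {"biosample_set", "study_set", "fs.files"}
report = set_arithmetic(schema_slots, mongo_collections,
                        set1_name="schema", set2_name="mongo")
print(report["intersection"])  # collections both modeled and present
print(report["mongo only"])    # live collections the schema does not model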
50 changes: 7 additions & 43 deletions project.Makefile
@@ -406,56 +406,25 @@ nmdc_schema/nmdc_schema_accepting_legacy_ids.py: nmdc_schema/nmdc_schema_accepti

# ----

# recommended setup:
# 1. . ~/sshproxy.sh -u {YOUR_NERSC_USERNAME}
# 2. ssh -i ~/.ssh/nersc -L27777:mongo-loadbalancer.nmdc.production.svc.spin.nersc.org:27017 -o ServerAliveInterval=60 {YOUR_NERSC_USERNAME}@dtn01.nersc.gov

# todo mongodb collection stats vs Database slots report
# todo convert to json
# todo compress large files
# todo: switch to API method for getting collection names and stats: https://api.microbiomedata.org/nmdcschema/collection_stats # partially implemented

make-rdf: rdf-clean local/mongo_as_nmdc_database_validation.log local/mongo_as_nmdc_database_cuire_repaired.ttl

temp:

# --selected-collections functional_annotation_agg \ # huge, no publicly available reference data (kegg)
# --selected-collections metaproteomics_analysis_activity_set \ # next slowest

# when connecting to the dev MongoDB, also use --client-base-url https://api-dev.microbiomedata.org
# make pre-composed prod and dev makefile tasks! since they require the user to provide a matching mongo port and api url

local/mongo_as_unvalidated_nmdc_database.yaml:
date # 276.50 seconds on 2023-08-30 without functional_annotation_agg or metaproteomics_analysis_activity_set
time $(RUN) pure-export \
--client-base-url https://api.microbiomedata.org \
--client-base-url https://api-napa.microbiomedata.org \
--endpoint-prefix nmdcschema \
--env-file local/.env \
--max-docs-per-coll 10000000 \
--mongo-db-name nmdc \
--mongo-host localhost \
--mongo-port 27777 \
--output-yaml $@ \
--page-size 10000 \
--schema-file src/schema/nmdc.yaml \
--selected-collections biosample_set \
--selected-collections data_object_set \
--selected-collections extraction_set \
--selected-collections field_research_site_set \
--selected-collections library_preparation_set \
--selected-collections mags_activity_set \
--selected-collections metabolomics_analysis_activity_set \
--selected-collections metagenome_annotation_activity_set \
--selected-collections metagenome_assembly_set \
--selected-collections metagenome_sequencing_activity_set \
--selected-collections metatranscriptome_activity_set \
--selected-collections nom_analysis_activity_set \
--selected-collections omics_processing_set \
--selected-collections pooling_set \
--selected-collections processed_sample_set \
--selected-collections read_based_taxonomy_analysis_activity_set \
--selected-collections read_qc_analysis_activity_set \
--selected-collections study_set
--selected-collections study_set \
--skip-collection-check


local/mongo_as_nmdc_database_rdf_safe.yaml: nmdc_schema/nmdc_schema_accepting_legacy_ids.yaml local/mongo_as_unvalidated_nmdc_database.yaml
@@ -471,16 +440,11 @@ local/mongo_as_nmdc_database_validation.log: nmdc_schema/nmdc_schema_accepting_l
date # 5m57.559s without functional_annotation_agg or metaproteomics_analysis_activity_set
time $(RUN) linkml-validate --schema $^ > $@

# from riot:
# WARNING: java.io.tmpdir directory does not exist

local/mongo_as_nmdc_database.ttl: nmdc_schema/nmdc_schema_accepting_legacy_ids.yaml local/mongo_as_nmdc_database_rdf_safe.yaml
date # 681.99 seconds on 2023-08-30 without functional_annotation_agg or metaproteomics_analysis_activity_set
time $(RUN) linkml-convert --output $@ --schema $^
export _JAVA_OPTIONS=-Djava.io.tmpdir=local
- $(RIOT) --validate $@ # < 1 minute

# todo: still getting anyurl typed string statement objects in RDF. I added a workarround in anyuri-strings-to-iris
# export _JAVA_OPTIONS=-Djava.io.tmpdir=local
# - $(RIOT) --validate $@ # < 1 minute

local/mongo_as_nmdc_database_cuire_repaired.ttl: local/mongo_as_nmdc_database.ttl
date
Expand All @@ -489,8 +453,8 @@ local/mongo_as_nmdc_database_cuire_repaired.ttl: local/mongo_as_nmdc_database.tt
--jsonld-context-jsons project/jsonld/nmdc.context.jsonld \
--emsl-biosample-uuid-replacement emsl_biosample_uuid_like \
--output-ttl $@
export _JAVA_OPTIONS=-Djava.io.tmpdir=local
- $(RIOT) --validate $@ # < 1 minute
# export _JAVA_OPTIONS=-Djava.io.tmpdir=local
# - $(RIOT) --validate $@ # < 1 minute
date

# ----
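
The pure-export target pages through each selected collection (--page-size 10000) up to a cap (--max-docs-per-coll 10000000) over the ssh tunnel described at the top of this Makefile section, with local port 27777 forwarded to the NERSC Mongo load balancer. A sketch of one way such paging can work with pymongo — sorting on _id and resuming past the last id seen; the real pure-export implementation may page differently:

from pymongo import MongoClient

def dump_collection_paged(client, db_name, collection,
                          page_size=10_000, max_docs=10_000_000):
    """Fetch up to max_docs documents in _id-ordered pages."""
    coll = client[db_name][collection]
    docs, last_id = [], None
    while len(docs) < max_docs:
        query = {"_id": {"$gt": last_id}} if last_id is not None else {}
        page = list(coll.find(query).sort("_id", 1).limit(page_size))
        if not page:
            break
        docs.extend(page)
        last_id = page[-1]["_id"]
    return docs[:max_docs]

# Usage, assuming the ssh tunnel from the comments above is running:
# client = MongoClient("mongodb://localhost:27777", directConnection=True)
# biosamples = dump_collection_paged(client, "nmdc", "biosample_set")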
