From ecd4a09ced3e722f7d02eb8c740ada8a334daee3 Mon Sep 17 00:00:00 2001
From: David Keator
Date: Mon, 19 Jul 2021 18:35:11 -0700
Subject: [PATCH] updated the way data element UUIDs are created: the dataset
 DOI is now used as part of the hash if it exists in the BIDS dataset. For
 csv2nidm, the user can specify a dataset ID which will be used. Also updated
 the version number and the click functionality of the pynidm linear
 regression tool so it is accessible as pynidm linreg

---
 nidm/core/BIDS_Constants.py           |  6 +++++-
 nidm/core/Constants.py                | 12 ++++++++---
 nidm/experiment/Utils.py              | 22 ++++++++++++--------
 nidm/experiment/tools/bidsmri2nidm.py | 30 +++++++++++++++++++++------
 nidm/experiment/tools/click_main.py   |  1 +
 nidm/experiment/tools/csv2nidm.py     | 10 +++++++--
 nidm/experiment/tools/nidm_linreg.py  |  4 ++--
 nidm/version.py                       |  2 +-
 8 files changed, 63 insertions(+), 24 deletions(-)
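Note on the new UUID scheme: when the dataset provides a DOI (or the csv2nidm user supplies -dataset_id), that identifier seeds the property string that is hashed into the personal data element UUID, so re-converting the same dataset yields the same UUIDs; without it, a random UUID seeds the hash as before. A minimal sketch of the idea, not the committed code (NIIRI_NS and make_cde_uuid are stand-ins for the real names in Utils.py DD_UUID()):

    import uuid
    import zlib

    NIIRI_NS = "http://iri.nidash.org/"  # stand-in for the niiri namespace in Utils.py

    def make_cde_uuid(variable_name, properties, dataset_identifier=None):
        # seed with the dataset DOI/ID when available so the hash is reproducible;
        # otherwise fall back to a random UUID (different on every conversion)
        if dataset_identifier is not None:
            property_string = dataset_identifier
        else:
            property_string = str(uuid.uuid4())
        # fold the data dictionary properties (label, description, levels, ...) into the hash
        for value in properties.values():
            property_string += str(value)
        crc32hash = zlib.crc32((variable_name + property_string).encode())
        return NIIRI_NS + variable_name + "_" + str(crc32hash)
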
diff --git a/nidm/core/BIDS_Constants.py b/nidm/core/BIDS_Constants.py
index 9c05b5f5..b26b51ca 100644
--- a/nidm/core/BIDS_Constants.py
+++ b/nidm/core/BIDS_Constants.py
@@ -10,7 +10,11 @@
     "Procedure" : Constants.NIDM_PROJECT_DESCRIPTION,
     "License" : Constants.NIDM_PROJECT_LICENSE,
     "ReferencesAndLinks" : Constants.NIDM_PROJECT_REFERENCES,
-    "Authors" : Constants.NIDM_AUTHOR
+    "Authors" : Constants.NIDM_AUTHOR,
+    "DatasetDOI" : Constants.NIDM_DOI,
+    "Funding" : Constants.NIDM_FUNDING,
+    "HowToAcknowledge" : Constants.NIDM_ACKNOWLEDGEMENTS
+
 }
 #BIDS Participants file -> NIDM constants mappings
diff --git a/nidm/core/Constants.py b/nidm/core/Constants.py
index 11b83fad..9c6f727c 100644
--- a/nidm/core/Constants.py
+++ b/nidm/core/Constants.py
@@ -61,7 +61,8 @@
 ONLI = Namespace("http://neurolog.unice.fr/ontoneurolog/v3.0/instrument.owl#")
 PATO = Namespace("http://purl.obolibrary.org/obo/pato#")
 DATALAD = Namespace("http://datasets.datalad.org/")
-INTERLEX = Namespace("http://uri.interlex.org/base/")
+INTERLEX = Namespace("http://uri.interlex.org/")
+EDAM = Namespace("https://bioportal.bioontology.org/ontologies/EDAM")
 
 namespaces = {
     # "prov": PROV,
@@ -95,7 +96,8 @@
     "onli" : ONLI,
     "pato" : PATO,
     "datalad" : DATALAD,
-    "ilx" : INTERLEX
+    "ilx" : INTERLEX,
+    "edam" : EDAM
 }
 
 # Empty graph used to compute qnames
@@ -386,6 +388,9 @@ def __init__(self, namespaces=None):
 NIDM_MRI_ASL = QualifiedName(provNamespace("nidm",NIDM),"ArterialSpinLabeling")
 CRYPTO_SHA512 =QualifiedName(provNamespace("crypto", CRYPTO),"sha512")
 DATALAD_LOCATION = QualifiedName(provNamespace("datalad", DATALAD),"Location")
+NIDM_DOI = QualifiedName(provNamespace("edam",EDAM),"data_1188")
+NIDM_FUNDING = QualifiedName(provNamespace("obo",OBO),"IAO_0000623")
+NIDM_ACKNOWLEDGEMENTS = QualifiedName(provNamespace("obo",OBO),"IAO_0000324")
 ##############################################################################
 # OBO constants
 OBO_EXAMPLE = OBO['IAO_0000112']
@@ -609,7 +614,8 @@ def __init__(self, namespaces=None):
 NIDM_MRI_T2_STAR,
 NIDM_MRI_DIFFUSION_TENSOR,
 NIDM_MRI_FLOW,
-NIDM_MRI_BOLD_EVENTS]
+NIDM_MRI_BOLD_EVENTS,
+NIDM_DOI]
 
 # Common isAbout URIs
 NIDM_IS_ABOUT_AGE = str(INTERLEX['ilx_0100400'])
diff --git a/nidm/experiment/Utils.py b/nidm/experiment/Utils.py
index 619998ad..c6bc125f 100644
--- a/nidm/experiment/Utils.py
+++ b/nidm/experiment/Utils.py
@@ -955,7 +955,7 @@ def redcap_datadictionary_to_json(redcap_dd_file,assessment_name):
 
 def map_variables_to_terms(df,directory, assessment_name, output_file=None,json_source=None,bids=False,owl_file='nidm',
-                           associate_concepts=True):
+                           associate_concepts=True, dataset_identifier=None):
     '''
     :param df: data frame with first row containing variable names
@@ -1253,7 +1253,7 @@ def map_variables_to_terms(df,directory, assessment_name, output_file=None,json_
         write_json_mapping_file(column_to_terms, output_file, bids)
 
     # get CDEs for data dictonary and NIDM graph entity of data
-    cde = DD_to_nidm(column_to_terms)
+    cde = DD_to_nidm(column_to_terms,dataset_identifier=dataset_identifier)
 
     return [column_to_terms, cde]
 
@@ -1652,7 +1652,7 @@ def annotate_data_element(source_variable, current_tuple, source_variable_annota
         print("levels: %s" % source_variable_annotations[current_tuple]['levels'])
     print("---------------------------------------------------------------------------------------")
 
-def DD_UUID (element,dd_struct):
+def DD_UUID (element,dd_struct,dataset_identifier=None):
     '''
     This function will produce a hash of the data dictionary (personal data element) properties defined
     by the user for use as a UUID.  The data dictionary key is a tuple identifying the file and variable
@@ -1669,7 +1669,11 @@
     # added getUUID to property string to solve problem where all openneuro datasets that have the same
     # source variable name and properties don't end up having the same UUID as they are sometimes not
     # the same and end up being added to the same entity when merging graphs across all openneuro projects
-    property_string=getUUID()
+    # if a dataset identifier is not provided then we use a random UUID
+    if dataset_identifier is not None:
+        property_string = dataset_identifier
+    else:
+        property_string = getUUID()
     for key, value in dd_struct[str(key_tuple)].items():
         if key == 'label':
             property_string = property_string + str(value)
@@ -1688,7 +1692,7 @@
     cde_id = URIRef(niiri_ns + safe_string(variable_name) + "_" + str(crc32hash))
     return cde_id
 
-def DD_to_nidm(dd_struct):
+def DD_to_nidm(dd_struct,dataset_identifier=None):
     '''
     Takes a DD json structure and returns nidm CDE-style graph to be added to NIDM documents
@@ -1735,7 +1739,7 @@
 
         # md5hash = hashlib.md5(str(key).encode()).hexdigest()
 
-        cde_id = DD_UUID(key,dd_struct)
+        cde_id = DD_UUID(key,dd_struct,dataset_identifier)
         #cde_id = URIRef(niiri_ns + safe_string(item) + "_" + str(crc32hash))
 
         g.add((cde_id,RDF.type, Constants.NIDM['PersonalDataElement']))
         g.add((cde_id,RDF.type, Constants.PROV['Entity']))
@@ -1757,7 +1761,7 @@
         elif (key == 'levels') or (key == 'Levels'):
             g.add((cde_id,Constants.NIDM['levels'],Literal(value)))
         elif key == 'source_variable':
-            g.add((cde_id, Constants.NIDM['source_variable'], Literal(value)))
+            g.add((cde_id, Constants.NIDM['sourceVariable'], Literal(value)))
         elif key == 'isAbout':
             #dct_ns = Namespace(Constants.DCT)
             #g.bind(prefix='dct', namespace=dct_ns)
@@ -1802,7 +1806,7 @@
         elif (key == 'maxValue') or (key == 'maximumValue'):
             g.add((cde_id, Constants.NIDM['maxValue'], Literal(value)))
         elif key == 'hasUnit':
-            g.add((cde_id, Constants.NIDM['hasUnit'], Literal(value)))
+            g.add((cde_id, Constants.NIDM['unitCode'], Literal(value)))
         elif key == 'sameAs':
             g.add((cde_id, Constants.NIDM['sameAs'], URIRef(value)))
         elif key == 'associatedWith':
@@ -1820,7 +1824,7 @@ def add_attributes_with_cde(prov_object, cde, row_variable, value):
 
     # find the ID in cdes where nidm:source_variable matches the row_variable
     # qres = cde.subjects(predicate=Constants.RDFS['label'],object=Literal(row_variable))
-    qres = cde.subjects(predicate=Constants.NIDM['source_variable'],object=Literal(row_variable))
+    qres = cde.subjects(predicate=Constants.NIDM['sourceVariable'],object=Literal(row_variable))
     for s in qres:
         entity_id = s
     # find prefix matching our url in rdflib graph...this is because we're bouncing between
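Note: the two predicate renames above (nidm:source_variable to nidm:sourceVariable, nidm:hasUnit to nidm:unitCode) change how personal data elements are serialized and looked up. Roughly, as an illustration only (the nidm# namespace URI is assumed to match Constants.NIDM, and the CDE URI is made up):

    from rdflib import Graph, Literal, Namespace, RDF, URIRef

    NIDM = Namespace("http://purl.org/nidash/nidm#")  # assumption: matches Constants.NIDM
    g = Graph()
    cde_id = URIRef("http://iri.nidash.org/age_3f2a1c09")  # hypothetical CDE id

    g.add((cde_id, RDF.type, NIDM['PersonalDataElement']))
    g.add((cde_id, NIDM['sourceVariable'], Literal("age")))  # was nidm:source_variable
    g.add((cde_id, NIDM['unitCode'], Literal("years")))      # was nidm:hasUnit

    # add_attributes_with_cde() now resolves the entity by the renamed predicate:
    matches = list(g.subjects(predicate=NIDM['sourceVariable'], object=Literal("age")))
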
diff --git a/nidm/experiment/tools/bidsmri2nidm.py b/nidm/experiment/tools/bidsmri2nidm.py
index ed7432eb..6386865a 100755
--- a/nidm/experiment/tools/bidsmri2nidm.py
+++ b/nidm/experiment/tools/bidsmri2nidm.py
@@ -575,6 +575,18 @@ def bidsmri2project(directory, args):
             else:
                 project.add_attributes({BIDS_Constants.dataset_description[key]:dataset[key]})
 
+            # added special case to include DOI of project in hash for data element UUIDs to prevent collisions with
+            # similar data elements from other projects and make the bids2nidm conversion deterministic in the sense
+            # that if you re-convert the same dataset to NIDM, the data element UUIDs will remain the same.
+            if key == "DatasetDOI":
+                if dataset[key] == "":
+                    dataset_doi = None
+                else:
+                    dataset_doi = dataset[key]
+            else:
+                dataset_doi = None
+
+
@@ -615,28 +627,34 @@ def bidsmri2project(directory, args):
             temp=DataFrame(columns=mapping_list)
             if args.no_concepts:
                 column_to_terms,cde = map_variables_to_terms(directory=directory,assessment_name='participants.tsv',
-                        df=temp,output_file=os.path.join(directory,'participants.json'),bids=True,associate_concepts=False)
+                        df=temp,output_file=os.path.join(directory,'participants.json'),bids=True,associate_concepts=False,
+                        dataset_identifier = dataset_doi)
             else:
                 column_to_terms,cde = map_variables_to_terms(directory=directory,assessment_name='participants.tsv',
-                        df=temp,output_file=os.path.join(directory,'participants.json'),bids=True)
+                        df=temp,output_file=os.path.join(directory,'participants.json'),bids=True,
+                        dataset_identifier = dataset_doi)
         else:
             #maps variables in CSV file to terms
             temp=DataFrame(columns=mapping_list)
             if args.no_concepts:
                 column_to_terms,cde = map_variables_to_terms(directory=directory, assessment_name='participants.tsv', df=temp,
-                        output_file=os.path.join(directory,'participants.json'),json_source=os.path.join(directory,'participants.json'),bids=True,associate_concepts=False)
+                        output_file=os.path.join(directory,'participants.json'),json_source=os.path.join(directory,'participants.json'),
+                        bids=True,associate_concepts=False, dataset_identifier = dataset_doi)
             else:
                 column_to_terms,cde = map_variables_to_terms(directory=directory, assessment_name='participants.tsv', df=temp,
-                        output_file=os.path.join(directory,'participants.json'),json_source=os.path.join(directory,'participants.json'),bids=True)
+                        output_file=os.path.join(directory,'participants.json'),json_source=os.path.join(directory,'participants.json'),
+                        bids=True,dataset_identifier = dataset_doi)
     else:
         #maps variables in CSV file to terms
         temp=DataFrame(columns=mapping_list)
         if args.no_concepts:
             column_to_terms, cde = map_variables_to_terms(directory=directory, assessment_name='participants.tsv', df=temp,
-                    output_file=os.path.join(directory,'participants.json'),json_source=args.json_map,bids=True,associate_concepts=False)
+                    output_file=os.path.join(directory,'participants.json'),json_source=args.json_map,bids=True,
+                    associate_concepts=False, dataset_identifier = dataset_doi)
         else:
             column_to_terms, cde = map_variables_to_terms(directory=directory, assessment_name='participants.tsv', df=temp,
-                    output_file=os.path.join(directory,'participants.json'),json_source=args.json_map,bids=True)
+                    output_file=os.path.join(directory,'participants.json'),json_source=args.json_map,bids=True,
+                    dataset_identifier = dataset_doi)
 
     for row in participants_data:
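Note: the DatasetDOI handling above reads dataset_description.json and treats an empty string the same as a missing key. Equivalent standalone logic (the path and DOI value are examples only):

    import json
    import os

    directory = '/path/to/bids/dataset'  # example path
    # dataset_description.json might contain, e.g.:
    #   {"Name": "Example Study", "DatasetDOI": "10.18112/openneuro.ds000001.v1.0.0"}
    with open(os.path.join(directory, 'dataset_description.json')) as fp:
        dataset = json.load(fp)

    # missing or empty DatasetDOI -> None, which keeps the old random-UUID behavior
    dataset_doi = dataset.get("DatasetDOI") or None
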
diff --git a/nidm/experiment/tools/click_main.py b/nidm/experiment/tools/click_main.py
index e98f8f65..f039435f 100644
--- a/nidm/experiment/tools/click_main.py
+++ b/nidm/experiment/tools/click_main.py
@@ -5,3 +5,4 @@
 from nidm.experiment.tools import nidm_concat
 from nidm.experiment.tools import nidm_merge
 from nidm.experiment.tools import nidm_convert
+from nidm.experiment.tools import nidm_linreg
diff --git a/nidm/experiment/tools/csv2nidm.py b/nidm/experiment/tools/csv2nidm.py
index 5ab91c48..52a14ec3 100644
--- a/nidm/experiment/tools/csv2nidm.py
+++ b/nidm/experiment/tools/csv2nidm.py
@@ -92,6 +92,9 @@ def main(argv):
                         'asked of the user. This is useful if you already have a -json_map specified without concepts and want to'
                         'simply run this program to get a NIDM file with user interaction to associate concepts.')
     parser.add_argument('-log','--log', dest='logfile',required=False, default=None, help="full path to directory to save log file. Log file name is csv2nidm_[arg.csv_file].log")
+    parser.add_argument('-dataset_id', '--dataset_id', dest='dataset_identifier',required=False, default=None,
+                        help='If this is provided, which can be any dataset ID although it is suggested to use a dataset '
+                             'DOI if available, unique data element IDs will use this information as part of the hash.')
     parser.add_argument('-out', dest='output_file', required=True, help="Full path with filename to save NIDM file")
     args = parser.parse_args()
 
@@ -120,12 +123,15 @@ def main(argv):
     #else:
 
     # if user did not specify -no_concepts then associate concepts interactively with user
     if not args.no_concepts:
-        column_to_terms, cde = map_variables_to_terms(df=df, assessment_name=basename(args.csv_file),directory=dirname(args.output_file), output_file=args.output_file, json_source=json_map)
+        column_to_terms, cde = map_variables_to_terms(df=df, assessment_name=basename(args.csv_file),
+                                                      directory=dirname(args.output_file), output_file=args.output_file,
+                                                      json_source=json_map,dataset_identifier=args.dataset_identifier)
     # run without concept mappings
     else:
         column_to_terms, cde = map_variables_to_terms(df=df, assessment_name=basename(args.csv_file),
                                                       directory=dirname(args.output_file), output_file=args.output_file,
-                                                      json_source=json_map, associate_concepts=False)
+                                                      json_source=json_map, associate_concepts=False,
+                                                      dataset_identifier=args.dataset_identifier)
 
     if args.logfile is not None:
         logging.basicConfig(filename=join(args.logfile,'csv2nidm_' + os.path.splitext(os.path.basename(args.csv_file))[0] + '.log'), level=logging.DEBUG)
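Example csv2nidm invocation with the new flag (the file paths and DOI are placeholders, and the -csv input flag is assumed from the existing CLI):

    csv2nidm -csv /data/assessment.csv -dataset_id 10.18112/openneuro.ds000001.v1.0.0 -out /data/assessment_nidm.ttl

Re-running the same conversion with the same -dataset_id yields the same personal data element UUIDs, so graphs from repeated conversions merge cleanly.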
diff --git a/nidm/experiment/tools/nidm_linreg.py b/nidm/experiment/tools/nidm_linreg.py
index 8c1c61d5..e73b9bce 100644
--- a/nidm/experiment/tools/nidm_linreg.py
+++ b/nidm/experiment/tools/nidm_linreg.py
@@ -68,7 +68,7 @@
               help="Optional output file (TXT) to store results of the linear regression, contrast, and regularization")
 @click.option("--regularization", "-r", required=False,
               help="This parameter will return the results of the linear regression with L1 or L2 regularization depending on the type specified, and the weight with the maximum likelihood solution")
-def full_regression(nidm_file_list, output_file, model, contrast, regularization):
+def linreg(nidm_file_list, output_file, model, contrast, regularization):
     #NOTE: Every time I make a global variable, it is because I need it in at least one other method.
     global c #used in linreg(), contrasting()
     c = contrast
     #Storing all important parameters in global variables so they can be accessed in other methods
@@ -645,4 +645,4 @@ def opencsv(data):
 
 # it can be used calling the script `python nidm_query.py -nl ... -q ..
 if __name__ == "__main__":
-    full_regression()
\ No newline at end of file
+    linreg()
diff --git a/nidm/version.py b/nidm/version.py
index a26097f8..d5c4fbd2 100644
--- a/nidm/version.py
+++ b/nidm/version.py
@@ -4,7 +4,7 @@
 # Format expected by setup.py and doc/source/conf.py: string of form "X.Y.Z"
 _version_major = 3
 _version_minor = 8
-_version_micro = '1' # use '' for first of series, number for 1 and above
+_version_micro = '2' # use '' for first of series, number for 1 and above
 _version_extra = ''
 # _version_extra = '' # Uncomment this for full releases
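Note: click derives a command's name from the decorated function when no explicit name is given, so renaming full_regression() to linreg(), together with the nidm_linreg import added to click_main.py, is what exposes the tool as a pynidm subcommand:

    pynidm linreg --help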