Merge pull request #277 from dbkeator/master
updated the way data element UUIDs are created, now using dataset DOI…
dbkeator authored Jul 20, 2021
2 parents 46d6307 + ecd4a09 commit 4e5f389
Showing 8 changed files with 63 additions and 24 deletions.
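In short: personal data element (CDE) UUIDs are now derived from a hash seeded with the dataset DOI when one is available, rather than with a freshly generated random UUID, so re-converting the same dataset yields the same data element identifiers while elements from different datasets remain distinct. A simplified sketch of the idea, not the exact PyNIDM code (helper names, the NIIRI base IRI, and the property list are illustrative):

    # Sketch: deterministic CDE identifier from a dataset identifier plus the
    # user-defined data-dictionary properties of one variable.
    from zlib import crc32
    from uuid import uuid4
    from rdflib import Namespace, URIRef

    niiri_ns = Namespace("http://iri.nidash.org/")   # assumed NIIRI base IRI

    def make_cde_id(variable_name, properties, dataset_identifier=None):
        # Seed with the dataset DOI if given; otherwise fall back to a random UUID,
        # which reproduces the old, non-deterministic behaviour.
        seed = dataset_identifier if dataset_identifier is not None else str(uuid4())
        for key in ("label", "source_variable", "description", "levels"):  # illustrative subset
            if key in properties:
                seed += str(properties[key])
        checksum = crc32(seed.encode())              # stable 32-bit hash of the seed string
        return URIRef(niiri_ns + variable_name + "_" + str(checksum))

    # Same dataset DOI + same data dictionary -> same identifier on every conversion:
    make_cde_id("age", {"label": "age", "description": "age in years"},
                dataset_identifier="10.18112/openneuro.ds000001.v1.0.0")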
6 changes: 5 additions & 1 deletion nidm/core/BIDS_Constants.py
@@ -10,7 +10,11 @@
"Procedure" : Constants.NIDM_PROJECT_DESCRIPTION,
"License" : Constants.NIDM_PROJECT_LICENSE,
"ReferencesAndLinks" : Constants.NIDM_PROJECT_REFERENCES,
"Authors" : Constants.NIDM_AUTHOR
"Authors" : Constants.NIDM_AUTHOR,
"DatasetDOI" : Constants.NIDM_DOI,
"Funding" : Constants.NIDM_FUNDING,
"HowToAcknowledge" : Constants.NIDM_ACKNOWLEDGEMENTS

}

#BIDS Participants file -> NIDM constants mappings
12 changes: 9 additions & 3 deletions nidm/core/Constants.py
@@ -61,7 +61,8 @@
ONLI = Namespace("http://neurolog.unice.fr/ontoneurolog/v3.0/instrument.owl#")
PATO = Namespace("http://purl.obolibrary.org/obo/pato#")
DATALAD = Namespace("http://datasets.datalad.org/")
-INTERLEX = Namespace("http://uri.interlex.org/base/")
+INTERLEX = Namespace("http://uri.interlex.org/")
+EDAM = Namespace("https://bioportal.bioontology.org/ontologies/EDAM")

namespaces = {
# "prov": PROV,
@@ -95,7 +96,8 @@
"onli" : ONLI,
"pato" : PATO,
"datalad" : DATALAD,
"ilx" : INTERLEX
"ilx" : INTERLEX,
"edam" : EDAM
}

# Empty graph used to compute qnames
Expand Down Expand Up @@ -386,6 +388,9 @@ def __init__(self, namespaces=None):
NIDM_MRI_ASL = QualifiedName(provNamespace("nidm",NIDM),"ArterialSpinLabeling")
CRYPTO_SHA512 =QualifiedName(provNamespace("crypto", CRYPTO),"sha512")
DATALAD_LOCATION = QualifiedName(provNamespace("datalad", DATALAD),"Location")
+NIDM_DOI = QualifiedName(provNamespace("edam",EDAM),"data_1188")
+NIDM_FUNDING = QualifiedName(provNamespace("obo",OBO),"IAO_0000623")
+NIDM_ACKNOWLEDGEMENTS = QualifiedName(provNamespace("obo",OBO),"IAO_0000324")
##############################################################################
# OBO constants
OBO_EXAMPLE = OBO['IAO_0000112']
@@ -609,7 +614,8 @@ def __init__(self, namespaces=None):
NIDM_MRI_T2_STAR,
NIDM_MRI_DIFFUSION_TENSOR,
NIDM_MRI_FLOW,
-NIDM_MRI_BOLD_EVENTS]
+NIDM_MRI_BOLD_EVENTS,
+NIDM_DOI]

# Common isAbout URIs
NIDM_IS_ABOUT_AGE = str(INTERLEX['ilx_0100400'])
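For context, the three new terms are built with the same prov QualifiedName pattern used throughout Constants.py; a minimal illustration under that assumption (namespace URIs taken from the file where visible, otherwise the standard OBO PURL base; comments name the BIDS fields they are mapped to in this commit, not official ontology labels):

    # Assumes the prov package's Namespace/QualifiedName, as used above.
    from prov.model import Namespace, QualifiedName

    EDAM = Namespace("edam", "https://bioportal.bioontology.org/ontologies/EDAM")
    OBO = Namespace("obo", "http://purl.obolibrary.org/obo/")

    NIDM_DOI = QualifiedName(EDAM, "data_1188")                # BIDS "DatasetDOI"
    NIDM_FUNDING = QualifiedName(OBO, "IAO_0000623")           # BIDS "Funding"
    NIDM_ACKNOWLEDGEMENTS = QualifiedName(OBO, "IAO_0000324")  # BIDS "HowToAcknowledge"
    # str(NIDM_DOI) should render as "edam:data_1188"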
22 changes: 13 additions & 9 deletions nidm/experiment/Utils.py
@@ -955,7 +955,7 @@ def redcap_datadictionary_to_json(redcap_dd_file,assessment_name):


def map_variables_to_terms(df,directory, assessment_name, output_file=None,json_source=None,bids=False,owl_file='nidm',
-associate_concepts=True):
+associate_concepts=True, dataset_identifier=None):
'''
:param df: data frame with first row containing variable names
@@ -1253,7 +1253,7 @@ def map_variables_to_terms(df,directory, assessment_name, output_file=None,json_
write_json_mapping_file(column_to_terms, output_file, bids)

# get CDEs for data dictonary and NIDM graph entity of data
-cde = DD_to_nidm(column_to_terms)
+cde = DD_to_nidm(column_to_terms,dataset_identifier=dataset_identifier)

return [column_to_terms, cde]

@@ -1652,7 +1652,7 @@ def annotate_data_element(source_variable, current_tuple, source_variable_annota
print("levels: %s" % source_variable_annotations[current_tuple]['levels'])
print("---------------------------------------------------------------------------------------")

-def DD_UUID (element,dd_struct):
+def DD_UUID (element,dd_struct,dataset_identifier=None):
'''
This function will produce a hash of the data dictionary (personal data element) properties defined
by the user for use as a UUID. The data dictionary key is a tuple identifying the file and variable
@@ -1669,7 +1669,11 @@ def DD_UUID (element,dd_struct):
# added getUUID to property string to solve problem where all openneuro datasets that have the same
# source variable name and properties don't end up having the same UUID as they are sometimes not
# the same and end up being added to the same entity when merging graphs across all openneuro projects
-property_string=getUUID()
+# if a dataset identifier is not provided then we use a random UUID
+if dataset_identifier is not None:
+    property_string = dataset_identifier
+else:
+    property_string = getUUID()
for key, value in dd_struct[str(key_tuple)].items():
if key == 'label':
property_string = property_string + str(value)
@@ -1688,7 +1692,7 @@ def DD_UUID (element,dd_struct):
cde_id = URIRef(niiri_ns + safe_string(variable_name) + "_" + str(crc32hash))
return cde_id

-def DD_to_nidm(dd_struct):
+def DD_to_nidm(dd_struct,dataset_identifier=None):
'''
Takes a DD json structure and returns nidm CDE-style graph to be added to NIDM documents
@@ -1735,7 +1739,7 @@ def DD_to_nidm(dd_struct):
# md5hash = hashlib.md5(str(key).encode()).hexdigest()


-cde_id = DD_UUID(key,dd_struct)
+cde_id = DD_UUID(key,dd_struct,dataset_identifier)
#cde_id = URIRef(niiri_ns + safe_string(item) + "_" + str(crc32hash))
g.add((cde_id,RDF.type, Constants.NIDM['PersonalDataElement']))
g.add((cde_id,RDF.type, Constants.PROV['Entity']))
@@ -1757,7 +1761,7 @@ def DD_to_nidm(dd_struct):
elif (key == 'levels') or (key == 'Levels'):
g.add((cde_id,Constants.NIDM['levels'],Literal(value)))
elif key == 'source_variable':
-g.add((cde_id, Constants.NIDM['source_variable'], Literal(value)))
+g.add((cde_id, Constants.NIDM['sourceVariable'], Literal(value)))
elif key == 'isAbout':
#dct_ns = Namespace(Constants.DCT)
#g.bind(prefix='dct', namespace=dct_ns)
@@ -1802,7 +1806,7 @@ def DD_to_nidm(dd_struct):
elif (key == 'maxValue') or (key == 'maximumValue'):
g.add((cde_id, Constants.NIDM['maxValue'], Literal(value)))
elif key == 'hasUnit':
-g.add((cde_id, Constants.NIDM['hasUnit'], Literal(value)))
+g.add((cde_id, Constants.NIDM['unitCode'], Literal(value)))
elif key == 'sameAs':
g.add((cde_id, Constants.NIDM['sameAs'], URIRef(value)))
elif key == 'associatedWith':
@@ -1820,7 +1824,7 @@ def add_attributes_with_cde(prov_object, cde, row_variable, value):

# find the ID in cdes where nidm:source_variable matches the row_variable
# qres = cde.subjects(predicate=Constants.RDFS['label'],object=Literal(row_variable))
-qres = cde.subjects(predicate=Constants.NIDM['source_variable'],object=Literal(row_variable))
+qres = cde.subjects(predicate=Constants.NIDM['sourceVariable'],object=Literal(row_variable))
for s in qres:
entity_id = s
# find prefix matching our url in rdflib graph...this is because we're bouncing between
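Caller-facing effect of the new map_variables_to_terms keyword, as a hedged usage sketch (paths, columns and the DOI value are illustrative; other parameters keep their defaults):

    # Passing dataset_identifier makes the generated personal data element IDs
    # reproducible across conversions of the same dataset; leaving it as None
    # keeps the previous behaviour of seeding the hash with a random UUID.
    from pandas import DataFrame
    from nidm.experiment.Utils import map_variables_to_terms

    df = DataFrame(columns=["participant_id", "age", "sex"])
    column_to_terms, cde = map_variables_to_terms(
        df=df,
        directory="/path/to/output",
        assessment_name="participants.tsv",
        output_file="/path/to/output/participants.json",
        bids=True,
        associate_concepts=False,          # skip interactive concept association
        dataset_identifier="10.18112/openneuro.ds000001.v1.0.0")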
30 changes: 24 additions & 6 deletions nidm/experiment/tools/bidsmri2nidm.py
@@ -575,6 +575,18 @@ def bidsmri2project(directory, args):
else:
project.add_attributes({BIDS_Constants.dataset_description[key]:dataset[key]})

+# added special case to include DOI of project in hash for data element UUIDs to prevent collisions with
+# similar data elements from other projects and make the bids2nidm conversion deterministic in the sense
+# that if you re-convert the same dataset to NIDM, the data element UUIDs will remain the same.
+if key == "DatasetDOI":
+    if dataset[key] == "":
+        dataset_doi = None
+    else:
+        dataset_doi = dataset[key]
+else:
+    dataset_doi = None
+
+



@@ -615,28 +627,34 @@ def bidsmri2project(directory, args):
temp=DataFrame(columns=mapping_list)
if args.no_concepts:
column_to_terms,cde = map_variables_to_terms(directory=directory,assessment_name='participants.tsv',
-df=temp,output_file=os.path.join(directory,'participants.json'),bids=True,associate_concepts=False)
+df=temp,output_file=os.path.join(directory,'participants.json'),bids=True,associate_concepts=False,
+dataset_identifier = dataset_doi)
else:
column_to_terms,cde = map_variables_to_terms(directory=directory,assessment_name='participants.tsv',
-df=temp,output_file=os.path.join(directory,'participants.json'),bids=True)
+df=temp,output_file=os.path.join(directory,'participants.json'),bids=True,
+dataset_identifier = dataset_doi)
else:
#maps variables in CSV file to terms
temp=DataFrame(columns=mapping_list)
if args.no_concepts:
column_to_terms,cde = map_variables_to_terms(directory=directory, assessment_name='participants.tsv', df=temp,
-output_file=os.path.join(directory,'participants.json'),json_source=os.path.join(directory,'participants.json'),bids=True,associate_concepts=False)
+output_file=os.path.join(directory,'participants.json'),json_source=os.path.join(directory,'participants.json'),
+bids=True,associate_concepts=False, dataset_identifier = dataset_doi)
else:
column_to_terms,cde = map_variables_to_terms(directory=directory, assessment_name='participants.tsv', df=temp,
-output_file=os.path.join(directory,'participants.json'),json_source=os.path.join(directory,'participants.json'),bids=True)
+output_file=os.path.join(directory,'participants.json'),json_source=os.path.join(directory,'participants.json'),
+bids=True,dataset_identifier = dataset_doi)
else:
#maps variables in CSV file to terms
temp=DataFrame(columns=mapping_list)
if args.no_concepts:
column_to_terms, cde = map_variables_to_terms(directory=directory, assessment_name='participants.tsv', df=temp,
-output_file=os.path.join(directory,'participants.json'),json_source=args.json_map,bids=True,associate_concepts=False)
+output_file=os.path.join(directory,'participants.json'),json_source=args.json_map,bids=True,
+associate_concepts=False, dataset_identifier = dataset_doi)
else:
column_to_terms, cde = map_variables_to_terms(directory=directory, assessment_name='participants.tsv', df=temp,
-output_file=os.path.join(directory,'participants.json'),json_source=args.json_map,bids=True)
+output_file=os.path.join(directory,'participants.json'),json_source=args.json_map,bids=True,
+dataset_identifier = dataset_doi)


for row in participants_data:
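The identifier comes from the BIDS dataset_description.json; roughly, the intent of the added logic is the following (a simplification, not the exact control flow above):

    # Sketch: derive dataset_doi from dataset_description.json. An empty or
    # missing "DatasetDOI" leaves dataset_doi as None, i.e. random-UUID seeding.
    import json
    import os

    def get_dataset_doi(bids_directory):
        with open(os.path.join(bids_directory, "dataset_description.json")) as fp:
            dataset = json.load(fp)
        doi = dataset.get("DatasetDOI", "")
        return doi if doi else None

    # {"Name": "...", "BIDSVersion": "1.6.0", "DatasetDOI": "10.18112/openneuro.ds000001.v1.0.0"}
    #   -> deterministic data element UUIDs
    # {"Name": "...", "BIDSVersion": "1.6.0"}
    #   -> dataset_doi is None, UUIDs seeded randomly as before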
1 change: 1 addition & 0 deletions nidm/experiment/tools/click_main.py
@@ -5,3 +5,4 @@
from nidm.experiment.tools import nidm_concat
from nidm.experiment.tools import nidm_merge
from nidm.experiment.tools import nidm_convert
+from nidm.experiment.tools import nidm_linreg
10 changes: 8 additions & 2 deletions nidm/experiment/tools/csv2nidm.py
@@ -92,6 +92,9 @@ def main(argv):
'asked of the user. This is useful if you already have a -json_map specified without concepts and want to'
'simply run this program to get a NIDM file with user interaction to associate concepts.')
parser.add_argument('-log','--log', dest='logfile',required=False, default=None, help="full path to directory to save log file. Log file name is csv2nidm_[arg.csv_file].log")
+parser.add_argument('-dataset_id', '--dataset_id', dest='dataset_identifier',required=False, default=None,
+help='If provided, this can be any dataset identifier, although a dataset '
+'DOI is suggested if available; unique data element IDs will use this information as part of the hash.')
parser.add_argument('-out', dest='output_file', required=True, help="Full path with filename to save NIDM file")
args = parser.parse_args()

@@ -120,12 +123,15 @@ def main(argv):
#else:
# if user did not specify -no_concepts then associate concepts interactively with user
if not args.no_concepts:
-column_to_terms, cde = map_variables_to_terms(df=df, assessment_name=basename(args.csv_file),directory=dirname(args.output_file), output_file=args.output_file, json_source=json_map)
+column_to_terms, cde = map_variables_to_terms(df=df, assessment_name=basename(args.csv_file),
+directory=dirname(args.output_file), output_file=args.output_file,
+json_source=json_map,dataset_identifier=args.dataset_identifier)
# run without concept mappings
else:
column_to_terms, cde = map_variables_to_terms(df=df, assessment_name=basename(args.csv_file),
directory=dirname(args.output_file), output_file=args.output_file,
-json_source=json_map, associate_concepts=False)
+json_source=json_map, associate_concepts=False,
+dataset_identifier=args.dataset_identifier)

if args.logfile is not None:
logging.basicConfig(filename=join(args.logfile,'csv2nidm_' + os.path.splitext(os.path.basename(args.csv_file))[0] + '.log'), level=logging.DEBUG)
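In practice this means adding, for example, -dataset_id 10.18112/openneuro.ds000001.v1.0.0 to an otherwise unchanged csv2nidm invocation (the value is illustrative; any stable dataset identifier works, a DOI is simply the suggested choice). When the flag is omitted the tool falls back to the previous behaviour of seeding data element UUIDs with a random value.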
4 changes: 2 additions & 2 deletions nidm/experiment/tools/nidm_linreg.py
@@ -68,7 +68,7 @@
help="Optional output file (TXT) to store results of the linear regression, contrast, and regularization")
@click.option("--regularization", "-r", required=False,
help="This parameter will return the results of the linear regression with L1 or L2 regularization depending on the type specified, and the weight with the maximum likelihood solution")
-def full_regression(nidm_file_list, output_file, model, contrast, regularization):
+def linreg(nidm_file_list, output_file, model, contrast, regularization):
#NOTE: Every time I make a global variable, it is because I need it in at least one other method.
global c #used in linreg(), contrasting()
c = contrast #Storing all important parameters in global variables so they can be accessed in other methods
@@ -645,4 +645,4 @@ def opencsv(data):

# it can be used calling the script `python nidm_query.py -nl ... -q ..
if __name__ == "__main__":
-full_regression()
+linreg()
2 changes: 1 addition & 1 deletion nidm/version.py
@@ -4,7 +4,7 @@
# Format expected by setup.py and doc/source/conf.py: string of form "X.Y.Z"
_version_major = 3
_version_minor = 8
-_version_micro = '1' # use '' for first of series, number for 1 and above
+_version_micro = '2' # use '' for first of series, number for 1 and above
_version_extra = ''
# _version_extra = '' # Uncomment this for full releases

