Merge pull request #277 from dbkeator/master
updated the way data element UUIDs are created, now using dataset DOI…
dbkeator authored Jul 20, 2021
2 parents 46d6307 + ecd4a09 commit 4e5f389
Showing 8 changed files with 63 additions and 24 deletions.
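In short: personal data element (CDE) UUIDs are now derived from a hash seeded with the dataset DOI when one is available, rather than with a freshly generated random UUID, so re-converting the same dataset yields the same data element identifiers while elements from different datasets remain distinct. A simplified sketch of the idea, not the exact PyNIDM code (helper names, the NIIRI base IRI, and the property list are illustrative):

    # Sketch: deterministic CDE identifier from a dataset identifier plus the
    # user-defined data-dictionary properties of one variable.
    from zlib import crc32
    from uuid import uuid4
    from rdflib import Namespace, URIRef

    niiri_ns = Namespace("http://iri.nidash.org/")   # assumed NIIRI base IRI

    def make_cde_id(variable_name, properties, dataset_identifier=None):
        # Seed with the dataset DOI if given; otherwise fall back to a random UUID,
        # which reproduces the old, non-deterministic behaviour.
        seed = dataset_identifier if dataset_identifier is not None else str(uuid4())
        for key in ("label", "source_variable", "description", "levels"):  # illustrative subset
            if key in properties:
                seed += str(properties[key])
        checksum = crc32(seed.encode())              # stable 32-bit hash of the seed string
        return URIRef(niiri_ns + variable_name + "_" + str(checksum))

    # Same dataset DOI + same data dictionary -> same identifier on every conversion:
    make_cde_id("age", {"label": "age", "description": "age in years"},
                dataset_identifier="10.18112/openneuro.ds000001.v1.0.0")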
6 changes: 5 additions & 1 deletion nidm/core/BIDS_Constants.py
@@ -10,7 +10,11 @@
"Procedure" : Constants.NIDM_PROJECT_DESCRIPTION,
"License" : Constants.NIDM_PROJECT_LICENSE,
"ReferencesAndLinks" : Constants.NIDM_PROJECT_REFERENCES,
"Authors" : Constants.NIDM_AUTHOR
"Authors" : Constants.NIDM_AUTHOR,
"DatasetDOI" : Constants.NIDM_DOI,
"Funding" : Constants.NIDM_FUNDING,
"HowToAcknowledge" : Constants.NIDM_ACKNOWLEDGEMENTS

}

#BIDS Participants file -> NIDM constants mappings
12 changes: 9 additions & 3 deletions nidm/core/Constants.py
@@ -61,7 +61,8 @@
ONLI = Namespace("http://neurolog.unice.fr/ontoneurolog/v3.0/instrument.owl#")
PATO = Namespace("http://purl.obolibrary.org/obo/pato#")
DATALAD = Namespace("http://datasets.datalad.org/")
-INTERLEX = Namespace("http://uri.interlex.org/base/")
+INTERLEX = Namespace("http://uri.interlex.org/")
+EDAM = Namespace("https://bioportal.bioontology.org/ontologies/EDAM")

namespaces = {
# "prov": PROV,
@@ -95,7 +96,8 @@
"onli" : ONLI,
"pato" : PATO,
"datalad" : DATALAD,
"ilx" : INTERLEX
"ilx" : INTERLEX,
"edam" : EDAM
}

# Empty graph used to compute qnames
Expand Down Expand Up @@ -386,6 +388,9 @@ def __init__(self, namespaces=None):
NIDM_MRI_ASL = QualifiedName(provNamespace("nidm",NIDM),"ArterialSpinLabeling")
CRYPTO_SHA512 =QualifiedName(provNamespace("crypto", CRYPTO),"sha512")
DATALAD_LOCATION = QualifiedName(provNamespace("datalad", DATALAD),"Location")
+NIDM_DOI = QualifiedName(provNamespace("edam",EDAM),"data_1188")
+NIDM_FUNDING = QualifiedName(provNamespace("obo",OBO),"IAO_0000623")
+NIDM_ACKNOWLEDGEMENTS = QualifiedName(provNamespace("obo",OBO),"IAO_0000324")
##############################################################################
# OBO constants
OBO_EXAMPLE = OBO['IAO_0000112']
@@ -609,7 +614,8 @@ def __init__(self, namespaces=None):
NIDM_MRI_T2_STAR,
NIDM_MRI_DIFFUSION_TENSOR,
NIDM_MRI_FLOW,
-NIDM_MRI_BOLD_EVENTS]
+NIDM_MRI_BOLD_EVENTS,
+NIDM_DOI]

# Common isAbout URIs
NIDM_IS_ABOUT_AGE = str(INTERLEX['ilx_0100400'])
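For context, the three new terms are built with the same prov QualifiedName pattern used throughout Constants.py; a minimal illustration under that assumption (namespace URIs taken from the file where visible, otherwise the standard OBO PURL base; comments name the BIDS fields they are mapped to in this commit, not official ontology labels):

    # Assumes the prov package's Namespace/QualifiedName, as used above.
    from prov.model import Namespace, QualifiedName

    EDAM = Namespace("edam", "https://bioportal.bioontology.org/ontologies/EDAM")
    OBO = Namespace("obo", "http://purl.obolibrary.org/obo/")

    NIDM_DOI = QualifiedName(EDAM, "data_1188")                # BIDS "DatasetDOI"
    NIDM_FUNDING = QualifiedName(OBO, "IAO_0000623")           # BIDS "Funding"
    NIDM_ACKNOWLEDGEMENTS = QualifiedName(OBO, "IAO_0000324")  # BIDS "HowToAcknowledge"
    # str(NIDM_DOI) should render as "edam:data_1188"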
22 changes: 13 additions & 9 deletions nidm/experiment/Utils.py
@@ -955,7 +955,7 @@ def redcap_datadictionary_to_json(redcap_dd_file,assessment_name):


def map_variables_to_terms(df,directory, assessment_name, output_file=None,json_source=None,bids=False,owl_file='nidm',
-associate_concepts=True):
+associate_concepts=True, dataset_identifier=None):
'''
:param df: data frame with first row containing variable names
@@ -1253,7 +1253,7 @@ def map_variables_to_terms(df,directory, assessment_name, output_file=None,json_
write_json_mapping_file(column_to_terms, output_file, bids)

# get CDEs for data dictonary and NIDM graph entity of data
-cde = DD_to_nidm(column_to_terms)
+cde = DD_to_nidm(column_to_terms,dataset_identifier=dataset_identifier)

return [column_to_terms, cde]

@@ -1652,7 +1652,7 @@ def annotate_data_element(source_variable, current_tuple, source_variable_annota
print("levels: %s" % source_variable_annotations[current_tuple]['levels'])
print("---------------------------------------------------------------------------------------")

-def DD_UUID (element,dd_struct):
+def DD_UUID (element,dd_struct,dataset_identifier=None):
'''
This function will produce a hash of the data dictionary (personal data element) properties defined
by the user for use as a UUID. The data dictionary key is a tuple identifying the file and variable
@@ -1669,7 +1669,11 @@ def DD_UUID (element,dd_struct):
# added getUUID to property string to solve problem where all openneuro datasets that have the same
# source variable name and properties don't end up having the same UUID as they are sometimes not
# the same and end up being added to the same entity when merging graphs across all openneuro projects
-property_string=getUUID()
+# if a dataset identifier is not provided then we use a random UUID
+if dataset_identifier is not None:
+    property_string = dataset_identifier
+else:
+    property_string = getUUID()
for key, value in dd_struct[str(key_tuple)].items():
if key == 'label':
property_string = property_string + str(value)
@@ -1688,7 +1692,7 @@ def DD_UUID (element,dd_struct):
cde_id = URIRef(niiri_ns + safe_string(variable_name) + "_" + str(crc32hash))
return cde_id

-def DD_to_nidm(dd_struct):
+def DD_to_nidm(dd_struct,dataset_identifier=None):
'''
Takes a DD json structure and returns nidm CDE-style graph to be added to NIDM documents
@@ -1735,7 +1739,7 @@ def DD_to_nidm(dd_struct):
# md5hash = hashlib.md5(str(key).encode()).hexdigest()


-cde_id = DD_UUID(key,dd_struct)
+cde_id = DD_UUID(key,dd_struct,dataset_identifier)
#cde_id = URIRef(niiri_ns + safe_string(item) + "_" + str(crc32hash))
g.add((cde_id,RDF.type, Constants.NIDM['PersonalDataElement']))
g.add((cde_id,RDF.type, Constants.PROV['Entity']))
@@ -1757,7 +1761,7 @@ def DD_to_nidm(dd_struct):
elif (key == 'levels') or (key == 'Levels'):
g.add((cde_id,Constants.NIDM['levels'],Literal(value)))
elif key == 'source_variable':
-g.add((cde_id, Constants.NIDM['source_variable'], Literal(value)))
+g.add((cde_id, Constants.NIDM['sourceVariable'], Literal(value)))
elif key == 'isAbout':
#dct_ns = Namespace(Constants.DCT)
#g.bind(prefix='dct', namespace=dct_ns)
@@ -1802,7 +1806,7 @@ def DD_to_nidm(dd_struct):
elif (key == 'maxValue') or (key == 'maximumValue'):
g.add((cde_id, Constants.NIDM['maxValue'], Literal(value)))
elif key == 'hasUnit':
-g.add((cde_id, Constants.NIDM['hasUnit'], Literal(value)))
+g.add((cde_id, Constants.NIDM['unitCode'], Literal(value)))
elif key == 'sameAs':
g.add((cde_id, Constants.NIDM['sameAs'], URIRef(value)))
elif key == 'associatedWith':
@@ -1820,7 +1824,7 @@ def add_attributes_with_cde(prov_object, cde, row_variable, value):

# find the ID in cdes where nidm:source_variable matches the row_variable
# qres = cde.subjects(predicate=Constants.RDFS['label'],object=Literal(row_variable))
-qres = cde.subjects(predicate=Constants.NIDM['source_variable'],object=Literal(row_variable))
+qres = cde.subjects(predicate=Constants.NIDM['sourceVariable'],object=Literal(row_variable))
for s in qres:
entity_id = s
# find prefix matching our url in rdflib graph...this is because we're bouncing between
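Caller-facing effect of the new map_variables_to_terms keyword, as a hedged usage sketch (paths, columns and the DOI value are illustrative; other parameters keep their defaults):

    # Passing dataset_identifier makes the generated personal data element IDs
    # reproducible across conversions of the same dataset; leaving it as None
    # keeps the previous behaviour of seeding the hash with a random UUID.
    from pandas import DataFrame
    from nidm.experiment.Utils import map_variables_to_terms

    df = DataFrame(columns=["participant_id", "age", "sex"])
    column_to_terms, cde = map_variables_to_terms(
        df=df,
        directory="/path/to/output",
        assessment_name="participants.tsv",
        output_file="/path/to/output/participants.json",
        bids=True,
        associate_concepts=False,          # skip interactive concept association
        dataset_identifier="10.18112/openneuro.ds000001.v1.0.0")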
30 changes: 24 additions & 6 deletions nidm/experiment/tools/bidsmri2nidm.py
@@ -575,6 +575,18 @@ def bidsmri2project(directory, args):
else:
project.add_attributes({BIDS_Constants.dataset_description[key]:dataset[key]})

+# added special case to include DOI of project in hash for data element UUIDs to prevent collisions with
+# similar data elements from other projects and make the bids2nidm conversion deterministic in the sense
+# that if you re-convert the same dataset to NIDM, the data element UUIDs will remain the same.
+if key == "DatasetDOI":
+    if dataset[key] == "":
+        dataset_doi = None
+    else:
+        dataset_doi = dataset[key]
+else:
+    dataset_doi = None
+
+



@@ -615,28 +627,34 @@ def bidsmri2project(directory, args):
temp=DataFrame(columns=mapping_list)
if args.no_concepts:
column_to_terms,cde = map_variables_to_terms(directory=directory,assessment_name='participants.tsv',
-df=temp,output_file=os.path.join(directory,'participants.json'),bids=True,associate_concepts=False)
+df=temp,output_file=os.path.join(directory,'participants.json'),bids=True,associate_concepts=False,
+dataset_identifier = dataset_doi)
else:
column_to_terms,cde = map_variables_to_terms(directory=directory,assessment_name='participants.tsv',
-df=temp,output_file=os.path.join(directory,'participants.json'),bids=True)
+df=temp,output_file=os.path.join(directory,'participants.json'),bids=True,
+dataset_identifier = dataset_doi)
else:
#maps variables in CSV file to terms
temp=DataFrame(columns=mapping_list)
if args.no_concepts:
column_to_terms,cde = map_variables_to_terms(directory=directory, assessment_name='participants.tsv', df=temp,
-output_file=os.path.join(directory,'participants.json'),json_source=os.path.join(directory,'participants.json'),bids=True,associate_concepts=False)
+output_file=os.path.join(directory,'participants.json'),json_source=os.path.join(directory,'participants.json'),
+bids=True,associate_concepts=False, dataset_identifier = dataset_doi)
else:
column_to_terms,cde = map_variables_to_terms(directory=directory, assessment_name='participants.tsv', df=temp,
-output_file=os.path.join(directory,'participants.json'),json_source=os.path.join(directory,'participants.json'),bids=True)
+output_file=os.path.join(directory,'participants.json'),json_source=os.path.join(directory,'participants.json'),
+bids=True,dataset_identifier = dataset_doi)
else:
#maps variables in CSV file to terms
temp=DataFrame(columns=mapping_list)
if args.no_concepts:
column_to_terms, cde = map_variables_to_terms(directory=directory, assessment_name='participants.tsv', df=temp,
-output_file=os.path.join(directory,'participants.json'),json_source=args.json_map,bids=True,associate_concepts=False)
+output_file=os.path.join(directory,'participants.json'),json_source=args.json_map,bids=True,
+associate_concepts=False, dataset_identifier = dataset_doi)
else:
column_to_terms, cde = map_variables_to_terms(directory=directory, assessment_name='participants.tsv', df=temp,
-output_file=os.path.join(directory,'participants.json'),json_source=args.json_map,bids=True)
+output_file=os.path.join(directory,'participants.json'),json_source=args.json_map,bids=True,
+dataset_identifier = dataset_doi)


for row in participants_data:
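The identifier comes from the BIDS dataset_description.json; roughly, the intent of the added logic is the following (a simplification, not the exact control flow above):

    # Sketch: derive dataset_doi from dataset_description.json. An empty or
    # missing "DatasetDOI" leaves dataset_doi as None, i.e. random-UUID seeding.
    import json
    import os

    def get_dataset_doi(bids_directory):
        with open(os.path.join(bids_directory, "dataset_description.json")) as fp:
            dataset = json.load(fp)
        doi = dataset.get("DatasetDOI", "")
        return doi if doi else None

    # {"Name": "...", "BIDSVersion": "1.6.0", "DatasetDOI": "10.18112/openneuro.ds000001.v1.0.0"}
    #   -> deterministic data element UUIDs
    # {"Name": "...", "BIDSVersion": "1.6.0"}
    #   -> dataset_doi is None, UUIDs seeded randomly as before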
1 change: 1 addition & 0 deletions nidm/experiment/tools/click_main.py
@@ -5,3 +5,4 @@
from nidm.experiment.tools import nidm_concat
from nidm.experiment.tools import nidm_merge
from nidm.experiment.tools import nidm_convert
+from nidm.experiment.tools import nidm_linreg
10 changes: 8 additions & 2 deletions nidm/experiment/tools/csv2nidm.py
@@ -92,6 +92,9 @@ def main(argv):
'asked of the user. This is useful if you already have a -json_map specified without concepts and want to'
'simply run this program to get a NIDM file with user interaction to associate concepts.')
parser.add_argument('-log','--log', dest='logfile',required=False, default=None, help="full path to directory to save log file. Log file name is csv2nidm_[arg.csv_file].log")
+parser.add_argument('-dataset_id', '--dataset_id', dest='dataset_identifier',required=False, default=None,
+help='If provided, this can be any dataset identifier, although a dataset '
+'DOI is suggested if available; unique data element IDs will use this information as part of the hash.')
parser.add_argument('-out', dest='output_file', required=True, help="Full path with filename to save NIDM file")
args = parser.parse_args()

@@ -120,12 +123,15 @@ def main(argv):
#else:
# if user did not specify -no_concepts then associate concepts interactively with user
if not args.no_concepts:
-column_to_terms, cde = map_variables_to_terms(df=df, assessment_name=basename(args.csv_file),directory=dirname(args.output_file), output_file=args.output_file, json_source=json_map)
+column_to_terms, cde = map_variables_to_terms(df=df, assessment_name=basename(args.csv_file),
+directory=dirname(args.output_file), output_file=args.output_file,
+json_source=json_map,dataset_identifier=args.dataset_identifier)
# run without concept mappings
else:
column_to_terms, cde = map_variables_to_terms(df=df, assessment_name=basename(args.csv_file),
directory=dirname(args.output_file), output_file=args.output_file,
-json_source=json_map, associate_concepts=False)
+json_source=json_map, associate_concepts=False,
+dataset_identifier=args.dataset_identifier)

if args.logfile is not None:
logging.basicConfig(filename=join(args.logfile,'csv2nidm_' + os.path.splitext(os.path.basename(args.csv_file))[0] + '.log'), level=logging.DEBUG)
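In practice this means adding, for example, -dataset_id 10.18112/openneuro.ds000001.v1.0.0 to an otherwise unchanged csv2nidm invocation (the value is illustrative; any stable dataset identifier works, a DOI is simply the suggested choice). When the flag is omitted the tool falls back to the previous behaviour of seeding data element UUIDs with a random value.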
4 changes: 2 additions & 2 deletions nidm/experiment/tools/nidm_linreg.py
@@ -68,7 +68,7 @@
help="Optional output file (TXT) to store results of the linear regression, contrast, and regularization")
@click.option("--regularization", "-r", required=False,
help="This parameter will return the results of the linear regression with L1 or L2 regularization depending on the type specified, and the weight with the maximum likelihood solution")
-def full_regression(nidm_file_list, output_file, model, contrast, regularization):
+def linreg(nidm_file_list, output_file, model, contrast, regularization):
#NOTE: Every time I make a global variable, it is because I need it in at least one other method.
global c #used in linreg(), contrasting()
c = contrast #Storing all important parameters in global variables so they can be accessed in other methods
@@ -645,4 +645,4 @@ def opencsv(data):

# it can be used calling the script `python nidm_query.py -nl ... -q ..
if __name__ == "__main__":
-full_regression()
+linreg()
2 changes: 1 addition & 1 deletion nidm/version.py
@@ -4,7 +4,7 @@
# Format expected by setup.py and doc/source/conf.py: string of form "X.Y.Z"
_version_major = 3
_version_minor = 8
-_version_micro = '1' # use '' for first of series, number for 1 and above
+_version_micro = '2' # use '' for first of series, number for 1 and above
_version_extra = ''
# _version_extra = '' # Uncomment this for full releases

