diff --git a/NCISlideUtil.py b/NCISlideUtil.py index a61dcfe..07c325e 100644 --- a/NCISlideUtil.py +++ b/NCISlideUtil.py @@ -2,40 +2,68 @@ import subprocess import time from multiprocessing.pool import ThreadPool - +import json import openslide - +import os +import requests from dev_utils import file_md5 from dev_utils import postslide from dev_utils import post_url # GLOBALS (for now) # config = {'thumbnail_size': 100, 'thread_limit': 20} -config = { 'thread_limit': 20} +config = {'thread_limit': 20} manifest_path = 'manifest.csv' # NCI DOE added flat file START +collections_path = 'specialties_list.json' flat_file_path = 'flat_file.csv' -apiKey = 'eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyVHlwZSI6IkFkbWluIiwidXNlckZpbHRlciI6WyJQdWJsaWMiXSwic3ViIjoibGluYW5sZGpAZ21haWwuY29tIiwiZW1haWwiOiJsaW5hbmxkakBnbWFpbC5jb20iLCJuYW1lIjoiTmFuIExpIiwicGljdHVyZSI6Imh0dHBzOi8vbGgzLmdvb2dsZXVzZXJjb250ZW50LmNvbS9hLS9BT2gxNEdpRnBfRlRFYWxGbmlzclcwQzF6NDFPbW1wZS1uTTd0Wkx4SXRNbE9nPXM5Ni1jIiwiaWF0IjoxNjM0ODQ0MjQ5LCJleHAiOjE2MzQ5MzA2NDl9.oRDeM_i1i4fQB3wlVmodAF4NG_umCZL2DIObWYMCviJwWXPAfNDtyMEY2GwMzgeMcQNPjIbDem6mhuDvhyOSmQc0J5lpxJpZYCVKnOQ95Q2rNy1F9gQjpuJ_vfIKRoakH9lE_W3leg8ff-zvUbgpOyzQxEg4louUGGpqG_5FVQnHG88CGAzzG7MvCb6wuyDrRvhxGBIRicjFN_zj8ZeXzmXD7U9KhOgKAW21XWhL4RyBhQyq8CORPx23omRKk7u72oTY5dlfzHj6O9Ll92MqJQEF1Xz08nVLlSMSw7pTKmmGWkK2DUKsp9sRvc2uFXButUpIrvaqh1ukCV6HU0hGIg' + # NCI DOE added flat file END # process expects a single image metadata as dictionary + + def process(img): - try: - img = openslidedata(img) - img['study'] = img.get('study', "") - img['specimen'] = img.get('specimen', "") - img['location'] = img['location'] or img['filename'] - img = postslide(img, post_url, apiKey) - print('process img:') - print(img) - except BaseException as e: - img['_status'] = e - return img + # check slides + sid = None + token_id = img['token_id'] + slide_name = img['name'] + res = requests.get(slide_find_url, params={'name': slide_name}) + if res.status_code == 200: + rs = res.json() + # the slide doesn't exist + if len(rs) < 1: + try: + img = openslidedata(img) + img['study'] = img.get('study', "") + img['specimen'] = img.get('specimen', "") + img['location'] = img['location'] or img['filename'] + img = postslide(img, post_url) + res = requests.get(slide_find_url, params={'name': slide_name}) + sid = res.json()[0]['_id']['$oid'] + print('process img:') + print(img) + except BaseException as e: + img['_status'] = e + + else: + sid = res.json()[0]['_id']['$oid'] + print(sid) + img['_status'] = 'existed' + # add slide to collection + cid = subspecialties_map.get(token_id.lower()) + if sid is not None or cid is not None: + res = requests.post(add_slide_to_collection_url, data=json.dumps({'cid': cid, 'sids': [sid]}), headers={ + 'content-type': 'application/json'}) + return img + else: + img['_status'] = res.status_code + return img def gen_thumbnail(filename, slide, size, imgtype="png"): dest = filename + "." + imgtype - + slide.get_thumbnail([size, size]).save(dest, imgtype.upper()) @@ -45,14 +73,16 @@ def openslidedata(metadata): metadata['mpp-x'] = slideData.get(openslide.PROPERTY_NAME_MPP_X, None) metadata['mpp-x'] = slideData.get(openslide.PROPERTY_NAME_MPP_Y, None) metadata['mpp'] = metadata['mpp-x'] or metadata['mpp-x'] or None - metadata['height'] = slideData.get(openslide.PROPERTY_NAME_BOUNDS_HEIGHT, None) - metadata['width'] = slideData.get(openslide.PROPERTY_NAME_BOUNDS_WIDTH, None) + metadata['height'] = slideData.get( + openslide.PROPERTY_NAME_BOUNDS_HEIGHT, None) + metadata['width'] = slideData.get( + openslide.PROPERTY_NAME_BOUNDS_WIDTH, None) metadata['vendor'] = slideData.get(openslide.PROPERTY_NAME_VENDOR, None) metadata['comment'] = slideData.get(openslide.PROPERTY_NAME_COMMENT, None) metadata['level_count'] = int(slideData.get('level_count', 1)) metadata['objective'] = float(slideData.get("aperio.AppMag", 0.0)) metadata['md5sum'] = file_md5(metadata['location']) - + # NCI DOE metadata START if metadata['height'] is None: metadata['height'] = slideData.get('aperio.OriginalHeight', None) @@ -62,18 +92,20 @@ def openslidedata(metadata): metadata['width'] = slideData.get('aperio.OriginalWidth', None) if metadata['width'] is None: metadata['width'] = slideData.get('openslide.level[0].width', None) - metadata['token_id'] = slideData.get('aperio.CustomField.TokenID', None) + metadata['token_id'] = slideData.get( + 'aperio.CustomField.TokenID', metadata['token_id']) metadata['proc_seq'] = slideData.get('aperio.CustomField.Proc_Seq', None) metadata['spec_site'] = slideData.get('aperio.CustomField.Spec_Site', None) - metadata['image_id'] = slideData.get('aperio.CustomField.ImageID', None) + metadata['image_id'] = slideData.get('aperio.CustomField.ImageID', None) flat_matedata = flat_map[metadata['token_id'].lower()] - metadata['registry_code'] = flat_matedata.get('registry',None) - metadata['primary_tumor_site_code'] = flat_matedata.get('primary_site',None) - metadata['primary_tumor_site_term'] = flat_matedata.get('site_text',None) - metadata['morphology_code'] = flat_matedata.get('histology_icdo3',None) - metadata['seer_coded_histology'] = flat_matedata.get('hist_text',None) - metadata['behavior_code'] = flat_matedata.get('behavior_icdo3',None) + metadata['registry_code'] = flat_matedata.get('registry', None) + metadata['primary_tumor_site_code'] = flat_matedata.get( + 'primary_site', None) + metadata['primary_tumor_site_term'] = flat_matedata.get('site_text', None) + metadata['morphology_code'] = flat_matedata.get('histology_icdo3', None) + metadata['seer_coded_histology'] = flat_matedata.get('hist_text', None) + metadata['behavior_code'] = flat_matedata.get('behavior_icdo3', None) metadata['timestamp'] = time.time() # NCI DOE metadata END @@ -85,11 +117,53 @@ def openslidedata(metadata): # NCI DOE create a metadata dict START flat_map = {} +subspecialties_map = {} +slide_find_url = 'http://ca-back:4010/data/Slide/find' +slide_post_url = 'http://ca-back:4010/data/Slide/post' +collection_find_url = 'http://ca-back:4010/data/Collection/find' +collection_post_url = 'http://ca-back:4010/data/Collection/post' +add_slide_to_collection_url = 'http://ca-back:4010/data/Collection/addSlidesToCollection' + + +def addSpecialty(data): + # check specialty exists + res = requests.get(collection_find_url, params=data) + if res.status_code == 200: + rs = res.json() + # return collection id if exist + if len(rs) > 0: + return rs[0]['_id']['$oid'] + # add the new one and return collection id if not exist + else: + res = requests.post(collection_post_url, data=json.dumps(data), headers={ + 'content-type': 'application/json'}) + return res.json()['ops'][0]['_id'] + else: + return None + + +# read the specialty list +if os.path.exists(collections_path): + with open(collections_path, 'r', encoding='utf-8-sig') as j: + collections = json.load(j) + for collection in collections: + # add specialty + pid = addSpecialty({'text': collection['specialty']}) + for sub in collection['subspecialties']: + # add specialty + cid = addSpecialty( + {'text': sub, 'pid': pid}) + # save the token id and collection id as map + if cid is not None: + subspecialties_map[sub.lower()] = cid + + # get flat file and create dict as map [tokenId, data] with open(flat_file_path, 'r', encoding='utf-8-sig') as f: reader = csv.DictReader(f) for row in reader: flat_map[row['tokenid'].lower()] = row + # NCI DOE create a metadata dict END # get manifest diff --git a/SlideUtil.py b/SlideUtil.py index 91dfb55..2017f8b 100644 --- a/SlideUtil.py +++ b/SlideUtil.py @@ -12,7 +12,7 @@ # GLOBALS (for now) config = {'thumbnail_size': 100, 'thread_limit': 20} manifest_path = 'manifest.csv' - +apiKey = '' # process expects a single image metadata as dictionary def process(img): @@ -21,7 +21,7 @@ def process(img): img['study'] = img.get('study', "") img['specimen'] = img.get('specimen', "") img['location'] = img['location'] or img['filename'] - img = postslide(img, post_url) + img = postslide(img, post_url, apiKey) except BaseException as e: img['_status'] = e return img diff --git a/dev_utils.py b/dev_utils.py index 9ea6de7..f344ac0 100644 --- a/dev_utils.py +++ b/dev_utils.py @@ -47,7 +47,8 @@ def getMetadata(filename, upload_folder, extended): def postslide(img, url, token=''): - url = url + '?token='+ token + if token != '': + url = url + '?token='+token payload = json.dumps(img) res = requests.post(url, data=payload, headers={'content-type': 'application/json'}) if res.status_code < 300: diff --git a/make_thumbs.py b/make_thumbs.py new file mode 100644 index 0000000..22523cc --- /dev/null +++ b/make_thumbs.py @@ -0,0 +1,50 @@ +import requests +import openslide +import pycurl +from multiprocessing.pool import ThreadPool + +SLIDE_LIST_URL = "http://ca-back:4010/data/Slide/find" +IIP_BASE = "http://ca-back:4010/img/IIP/raw/?FIF=" +UPDATE_URL = "http://ca-back:4010/data/Slide/update" +# TODO -- token input? +IM_SIZE = 256 +THREADS = 5 +REGNERATE = False +SAVE_DIR = "/images/thumbnails/" + +def setThumb(id, val): + requests.post(UPDATE_URL + "?_id=" + id, json={'thumbnail': val}) + +def gen_thumbnail(filename, slide, size, imgtype="png"): + dest = SAVE_DIR + filename + "." + imgtype + print(dest) + slide.get_thumbnail([size, size]).save(dest, imgtype.upper()) + +def process(record): + file = record["location"] + name = record["name"] + # skip ones which already have a thumbnail, unless otherwise specified + if REGNERATE or not record.get("thumbnail", False): + try: + slide = openslide.OpenSlide(file) + gen_thumbnail(name, slide, IM_SIZE, imgtype="png") + setThumb(record['_id']["$oid"], name+".png") + return "" + except BaseException as e: + try: + url = IIP_BASE + file + "&WID=200&CVT=png" + c = pycurl.Curl() + c.setopt(c.URL, url) + with open(SAVE_DIR+name+".png", "wb") as f: + c.setopt(c.WRITEFUNCTION, f.write) + c.perform() + setThumb(record['_id']["$oid"], name+".png") + except BaseException as y: + return [name, y] + +# do it +manifest = requests.get(SLIDE_LIST_URL).json() +print(manifest[0]) + +res = ThreadPool(THREADS).imap_unordered(process, manifest) +print([x for x in filter(None,[r for r in res])]) diff --git a/requirements.txt b/requirements.txt index 488377f..7e6e95e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,5 @@ numpy Pillow google-api-python-client google-auth-httplib2 -google-auth-oauthlib \ No newline at end of file +google-auth-oauthlib +pycurl