From ccb45dce9578afb653dc681ff0d0fcee29990013 Mon Sep 17 00:00:00 2001 From: Aukerman Date: Tue, 5 Apr 2022 12:34:01 -0400 Subject: [PATCH 01/18] Make other tools dataset enabled --- .../luna/common/cli/post_to_dataset.py | 8 +++-- .../cli/extract_kfunction_statistics.py | 2 +- .../pathology/cli/extract_tile_statistics.py | 8 ++--- .../cli/run_stardist_cell_detection.py | 33 ++++++++++--------- .../luna/pathology/cli/save_tiles.py | 5 +-- 5 files changed, 30 insertions(+), 26 deletions(-) diff --git a/pyluna-common/luna/common/cli/post_to_dataset.py b/pyluna-common/luna/common/cli/post_to_dataset.py index 914eaa5d..1c887d84 100644 --- a/pyluna-common/luna/common/cli/post_to_dataset.py +++ b/pyluna-common/luna/common/cli/post_to_dataset.py @@ -30,9 +30,9 @@ def cli(**cli_kwargs): output data \b Example: - CLI_TOOL ./slides/10001.svs ./halo/10001.job18484.annotations - -an Tumor - -o ./masks/10001/ + post_to_dataset /path/to/featuredata + --waystation_url Tumor + --dataset_id MY_DATASET """ cli_runner( cli_kwargs, _params_, post_to_dataset, pass_keys=True) @@ -61,6 +61,8 @@ def post_to_dataset(input_feature_data, waystation_url, dataset_id, keys): res = requests.post(post_url, files={'segment_data': open (input_feature_data, 'rb')}, data={"segment_keys": json.dumps(keys)}) + print (res.text) + logger.info (f"Response: {res}, Response data: {res.json()}") return {} diff --git a/pyluna-pathology/luna/pathology/cli/extract_kfunction_statistics.py b/pyluna-pathology/luna/pathology/cli/extract_kfunction_statistics.py index 13d9f622..e3e2ed2f 100644 --- a/pyluna-pathology/luna/pathology/cli/extract_kfunction_statistics.py +++ b/pyluna-pathology/luna/pathology/cli/extract_kfunction_statistics.py @@ -73,7 +73,7 @@ def extract_kfunction(input_cell_objects, tile_size, intensity_label, tile_strid Returns: dict: metadata about function call """ - df = pd.read_csv(input_cell_objects) + df = pd.read_parquet(input_cell_objects) l_address = [] l_k_function = [] diff --git a/pyluna-pathology/luna/pathology/cli/extract_tile_statistics.py b/pyluna-pathology/luna/pathology/cli/extract_tile_statistics.py index ab8cb748..850491fe 100644 --- a/pyluna-pathology/luna/pathology/cli/extract_tile_statistics.py +++ b/pyluna-pathology/luna/pathology/cli/extract_tile_statistics.py @@ -61,7 +61,7 @@ def extract_tile_statistics(input_slide_tiles, output_dir): df = ( pd.read_csv(input_slide_tiles) .set_index("address") - .drop(columns=["x_coord", "y_coord", "tile_size"]) + .drop(columns=["x_coord", "y_coord", "tile_size", 'xy_extent', 'tile_units']) ) print(df.columns) @@ -75,14 +75,14 @@ def extract_tile_statistics(input_slide_tiles, output_dir): df_feature_data = pd.DataFrame([dict_feature_data]) output_feature_file = os.path.join( - output_dir, Path(input_slide_tiles).stem + "_tile_stats.csv" + output_dir, Path(input_slide_tiles).stem + "_tile_stats.parquet" ) logger.info(df_feature_data) - df_feature_data.to_csv(output_feature_file) + df_feature_data.to_parquet(output_feature_file) - properties = {"feature_csv": output_feature_file} + properties = {"feature_data": output_feature_file} return properties diff --git a/pyluna-pathology/luna/pathology/cli/run_stardist_cell_detection.py b/pyluna-pathology/luna/pathology/cli/run_stardist_cell_detection.py index 1b540a43..586b87b3 100644 --- a/pyluna-pathology/luna/pathology/cli/run_stardist_cell_detection.py +++ b/pyluna-pathology/luna/pathology/cli/run_stardist_cell_detection.py @@ -107,20 +107,20 @@ def run_stardist_cell_detection( os.makedirs(output_dir, exist_ok=True) - client = docker.from_env() - container = client.containers.run( - volumes={ - input_slide_image: {"bind": f"/inputs/{slide_filename}", "mode": "ro"}, - output_dir: {"bind": "/output_dir", "mode": "rw"}, - }, - nano_cpus=int(num_cores * 1e9), - image=docker_image, - command=command, - detach=True, - ) - - for line in container.logs(stream=True): - print(line.decode(), end="") + # client = docker.from_env() + # container = client.containers.run( + # volumes={ + # input_slide_image: {"bind": f"/inputs/{slide_filename}", "mode": "ro"}, + # output_dir: {"bind": "/output_dir", "mode": "rw"}, + # }, + # nano_cpus=int(num_cores * 1e9), + # image=docker_image, + # command=command, + # detach=True, + # ) + + # for line in container.logs(stream=True): + # print(line.decode(), end="") stardist_output = os.path.join(output_dir, "cell_detections.tsv") @@ -132,14 +132,15 @@ def run_stardist_cell_detection( columns={"Centroid X µm": "x_coord", "Centroid Y µm": "y_coord"} ) # x,ys follow this convention - output_header_file = os.path.join(output_dir, f"{slide_id}_cell_objects.csv") - df.to_csv(output_header_file) + output_header_file = os.path.join(output_dir, f"{slide_id}_cell_objects.parquet") + df.to_parquet(output_header_file) logger.info("Generated cell data:") logger.info(df) properties = { "cell_objects": output_header_file, + "feature_data": output_header_file, "spatial": True, "total_cells": len(df), "segment_keys": {"slide_id": slide_id}, diff --git a/pyluna-pathology/luna/pathology/cli/save_tiles.py b/pyluna-pathology/luna/pathology/cli/save_tiles.py index ab1d6639..b97cda62 100644 --- a/pyluna-pathology/luna/pathology/cli/save_tiles.py +++ b/pyluna-pathology/luna/pathology/cli/save_tiles.py @@ -87,7 +87,7 @@ def save_tiles(input_slide_image, input_slide_tiles, output_dir, num_cores, batc slide_id = Path(input_slide_image).stem df = pd.read_csv(input_slide_tiles).set_index('address') - output_header_file = f"{output_dir}/{slide_id}.tiles.csv" + output_header_file = f"{output_dir}/{slide_id}.tiles.parquet" output_hdf_file = f"{output_dir}/{slide_id}.tiles.h5" logger.info(f"Now generating tiles with num_cores={num_cores} and batch_size={batch_size}!") @@ -108,10 +108,11 @@ def save_tiles(input_slide_image, input_slide_tiles, output_dir, num_cores, batc df['tile_store'] = output_hdf_file logger.info(df) - df.to_csv(output_header_file) + df.to_parquet(output_header_file) properties = { "slide_tiles": output_header_file, # "Tiles" are the metadata that describe them + "feature_data": output_header_file, # Tiles can act like feature data "total_tiles": len(df), } From 5e234f898d464eb50746c3de33ad2030dd5f5aef Mon Sep 17 00:00:00 2001 From: Aukerman Date: Tue, 5 Apr 2022 13:29:25 -0400 Subject: [PATCH 02/18] add dataset post as part of CLI runner --- .../luna/common/cli/post_to_dataset.py | 71 ----------------- pyluna-common/luna/common/utils.py | 79 ++++++++++++++++++- .../luna/pathology/cli/save_tiles.py | 2 + 3 files changed, 80 insertions(+), 72 deletions(-) delete mode 100644 pyluna-common/luna/common/cli/post_to_dataset.py diff --git a/pyluna-common/luna/common/cli/post_to_dataset.py b/pyluna-common/luna/common/cli/post_to_dataset.py deleted file mode 100644 index 1c887d84..00000000 --- a/pyluna-common/luna/common/cli/post_to_dataset.py +++ /dev/null @@ -1,71 +0,0 @@ -# General imports -import os, json, logging, yaml -import click -import requests - -from luna.common.custom_logger import init_logger - -init_logger() -logger = logging.getLogger('post_to_dataset') ### Add CLI tool name - -from luna.common.utils import cli_runner - -_params_ = [('input_feature_data', str), ('waystation_url', str), ('dataset_id', str)] - -@click.command() -@click.argument('input_feature_data', nargs=1) -### Additional options -@click.option('-w', '--waystation_url', required=False, - help='URL of waystation') -@click.option('-dsid', '--dataset_id', required=False, - help='Dataset identifier (table name)') -def cli(**cli_kwargs): - """ A cli tool - - \b - Inputs: - input: input data - \b - Outputs: - output data - \b - Example: - post_to_dataset /path/to/featuredata - --waystation_url Tumor - --dataset_id MY_DATASET - """ - cli_runner( cli_kwargs, _params_, post_to_dataset, pass_keys=True) - - -### Transform imports -def post_to_dataset(input_feature_data, waystation_url, dataset_id, keys): - """ CLI tool method - - Args: - input_data (str): path to input data - output_dir (str): output/working directory - - Returns: - dict: metadata about function call - """ - - segment_id = "-".join( - [v for _, v in sorted(keys.items())] - ) - - logger.info (f"Input feature data: {input_feature_data}, {keys}") - - post_url = os.path.join ( waystation_url, "datasets", dataset_id, "segments", segment_id ) - - logger.info (f"Posting to: {post_url}") - - res = requests.post(post_url, files={'segment_data': open (input_feature_data, 'rb')}, data={"segment_keys": json.dumps(keys)}) - - print (res.text) - - logger.info (f"Response: {res}, Response data: {res.json()}") - - return {} - -if __name__ == "__main__": - cli() diff --git a/pyluna-common/luna/common/utils.py b/pyluna-common/luna/common/utils.py index 4483fbed..d268a479 100644 --- a/pyluna-common/luna/common/utils.py +++ b/pyluna-common/luna/common/utils.py @@ -10,9 +10,16 @@ from luna.common.CodeTimer import CodeTimer import itertools +import shutil + import pandas as pd from pathlib import Path +import requests +from functools import partial +import urllib + + logger = logging.getLogger(__name__) # Distinct types that are actually the same (effectively) @@ -344,7 +351,61 @@ def expand_inputs(given_params: dict): return d_params, d_keys -from functools import partial + +def get_dataset_url(): + dataset_url = os.environ.get("DATASET_URL", None) + + if dataset_url is None: + logger.warning("Requesting feature data be sent to dataset, however no dataset URL provided, please set env DATASET_URL!") + else: + logger.info(f"Found dataset URL = {dataset_url}") + + return dataset_url + + + +def post_to_dataset(input_feature_data, waystation_url, dataset_id, keys): + """ CLI tool method + + Args: + input_data (str): path to input data + output_dir (str): output/working directory + + Returns: + dict: metadata about function call + """ + + logger.info(f"Adding {input_feature_data} to {dataset_id} via {waystation_url}") + + segment_id = "-".join( + [v for _, v in sorted(keys.items())] + ) + + logger.info(f"SEGMENT_ID={segment_id}") + + post_url = os.path.join ( waystation_url, "datasets", dataset_id, "segments", segment_id ) + + if 'http' in post_url: + # The cool way, using luna waystation + + logger.info (f"Posting to: {post_url}") + + res = requests.post(post_url, files={'segment_data': open (input_feature_data, 'rb')}, data={"segment_keys": json.dumps(keys)}) + + logger.info (f"{res}: {res.text}") + + elif 'file:/' in post_url: + # The less cool way, just using file paths + + segment_dir = Path ( urllib.parse.urlparse(post_url).path ) + + logger.info (f"Writing to: {segment_dir}") + + os.makedirs(segment_dir, exist_ok=True) + + shutil.copy(input_feature_data, segment_dir.joinpath("data.parquet")) + + def cli_runner( cli_kwargs: dict, cli_params: List[tuple], cli_function: Callable[..., dict], pass_keys: bool = False @@ -365,6 +426,8 @@ def cli_runner( # if "output_dir" not in cli_kwargs.keys(): # raise RuntimeError("CLI Runners assume an output directory") + dataset_id = cli_kwargs.get("dataset_id", None) + # Get params from param file if cli_kwargs.get("method_param_path"): with open(cli_kwargs.get("method_param_path"), "r") as yaml_file: @@ -415,10 +478,24 @@ def cli_runner( else: kwargs['segment_keys'] = keys + # Save metadata on disk if "output_dir" in kwargs: with open(os.path.join(output_dir, "metadata.yml"), "w") as fp: yaml.dump(kwargs, fp) + # Save feature data in parquet + if dataset_id is not None: + + if "feature_data" in kwargs: + feature_data = kwargs.get("feature_data") + logger.info(f"Adding feature segment {feature_data} to {dataset_id}") + + dataset_url = get_dataset_url() + + if dataset_url is not None: + post_to_dataset( feature_data, dataset_url, dataset_id, keys=kwargs['segment_keys']) + + logger.info("Done.") diff --git a/pyluna-pathology/luna/pathology/cli/save_tiles.py b/pyluna-pathology/luna/pathology/cli/save_tiles.py index b97cda62..2a9547ff 100644 --- a/pyluna-pathology/luna/pathology/cli/save_tiles.py +++ b/pyluna-pathology/luna/pathology/cli/save_tiles.py @@ -22,6 +22,8 @@ help="Batch size used for inference speedup", default=64) @click.option('-m', '--method_param_path', required=False, help='path to a metadata json/yaml file with method parameters to reproduce results') +@click.option('-dsid', '--dataset_id', required=False, + help='Optional dataset identifier to add results to') def cli(**cli_kwargs): """Saves tiles to disk From 006a546d3a3c13906a98fd854b43df3ed17e910c Mon Sep 17 00:00:00 2001 From: Aukerman Date: Tue, 5 Apr 2022 13:36:49 -0400 Subject: [PATCH 03/18] Add dataset ID options --- .../luna/pathology/cli/extract_tile_statistics.py | 6 ++++++ .../luna/pathology/cli/run_stardist_cell_detection.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/pyluna-pathology/luna/pathology/cli/extract_tile_statistics.py b/pyluna-pathology/luna/pathology/cli/extract_tile_statistics.py index 850491fe..1fa4460d 100644 --- a/pyluna-pathology/luna/pathology/cli/extract_tile_statistics.py +++ b/pyluna-pathology/luna/pathology/cli/extract_tile_statistics.py @@ -29,6 +29,12 @@ required=False, help="path to a metadata json/yaml file with method parameters to reproduce results", ) +@click.option( + "-dsid", + "--dataset_id", + required=False, + help="Optional dataset identifier to add tabular output to", +) def cli(**cli_kwargs): """Extracts statistics over tiles diff --git a/pyluna-pathology/luna/pathology/cli/run_stardist_cell_detection.py b/pyluna-pathology/luna/pathology/cli/run_stardist_cell_detection.py index 586b87b3..fe6313f2 100644 --- a/pyluna-pathology/luna/pathology/cli/run_stardist_cell_detection.py +++ b/pyluna-pathology/luna/pathology/cli/run_stardist_cell_detection.py @@ -50,6 +50,12 @@ required=False, help="path to a metadata json/yaml file with method parameters to reproduce results", ) +@click.option( + "-dsid", + "--dataset_id", + required=False, + help="Optional dataset identifier to add tabular output to", +) def cli(**cli_kwargs): """Run stardist using qupath CLI within a docker container From a47d10476a6bf2e70c3b8e4b20e7063d0a0a0660 Mon Sep 17 00:00:00 2001 From: Aukerman Date: Tue, 5 Apr 2022 13:37:13 -0400 Subject: [PATCH 04/18] Revert uncomment --- .../cli/run_stardist_cell_detection.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pyluna-pathology/luna/pathology/cli/run_stardist_cell_detection.py b/pyluna-pathology/luna/pathology/cli/run_stardist_cell_detection.py index fe6313f2..50a39b38 100644 --- a/pyluna-pathology/luna/pathology/cli/run_stardist_cell_detection.py +++ b/pyluna-pathology/luna/pathology/cli/run_stardist_cell_detection.py @@ -113,20 +113,20 @@ def run_stardist_cell_detection( os.makedirs(output_dir, exist_ok=True) - # client = docker.from_env() - # container = client.containers.run( - # volumes={ - # input_slide_image: {"bind": f"/inputs/{slide_filename}", "mode": "ro"}, - # output_dir: {"bind": "/output_dir", "mode": "rw"}, - # }, - # nano_cpus=int(num_cores * 1e9), - # image=docker_image, - # command=command, - # detach=True, - # ) - - # for line in container.logs(stream=True): - # print(line.decode(), end="") + client = docker.from_env() + container = client.containers.run( + volumes={ + input_slide_image: {"bind": f"/inputs/{slide_filename}", "mode": "ro"}, + output_dir: {"bind": "/output_dir", "mode": "rw"}, + }, + nano_cpus=int(num_cores * 1e9), + image=docker_image, + command=command, + detach=True, + ) + + for line in container.logs(stream=True): + print(line.decode(), end="") stardist_output = os.path.join(output_dir, "cell_detections.tsv") From 8363d1f7758ed5a38ce586eabb52d290b757508c Mon Sep 17 00:00:00 2001 From: Aukerman Date: Tue, 5 Apr 2022 13:57:12 -0400 Subject: [PATCH 05/18] Move to parquet --- .../luna/pathology/cli/save_tiles.py | 10 +++++----- .../luna/pathology/common/schemas.py | 2 +- .../tests/luna/pathology/cli/test_save_tiles.py | 6 +++--- .../data/generate_tiles/123/123.tiles.parquet | Bin 0 -> 4945 bytes 4 files changed, 9 insertions(+), 9 deletions(-) create mode 100644 pyluna-pathology/tests/luna/pathology/cli/testdata/data/generate_tiles/123/123.tiles.parquet diff --git a/pyluna-pathology/luna/pathology/cli/save_tiles.py b/pyluna-pathology/luna/pathology/cli/save_tiles.py index 2a9547ff..d788343b 100644 --- a/pyluna-pathology/luna/pathology/cli/save_tiles.py +++ b/pyluna-pathology/luna/pathology/cli/save_tiles.py @@ -28,14 +28,14 @@ def cli(**cli_kwargs): """Saves tiles to disk Tiles addresses and arrays are saved as key-value pairs in (tiles.h5), - and the corresponding manifest/header file (tiles.csv) is also generated + and the corresponding manifest/header file (tiles.parquet) is also generated Adds tile_store to manifest for use in HDF5 Image loader \b Inputs: input_slide_image: slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...) - input_slide_tiles: path to tile images (.tiles.csv) + input_slide_tiles: path to tile images (.tiles.parquet) Outputs: slide_tiles @@ -75,11 +75,11 @@ def save_tiles(input_slide_image, input_slide_tiles, output_dir, num_cores, batc """Saves tiles to disk Tiles addresses and arrays are saved as key-value pairs in (tiles.h5), - and the corresponding manifest/header file (tiles.csv) is also generated + and the corresponding manifest/header file (tiles.parquet) is also generated Args: input_slide_image (str): path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...) - input_slide_tiles (str): path to a slide-tile manifest file (.tiles.csv) + input_slide_tiles (str): path to a slide-tile manifest file (.tiles.parquet) output_dir (str): output/working directory batch_size (int): size in batch dimension to chuck jobs @@ -87,7 +87,7 @@ def save_tiles(input_slide_image, input_slide_tiles, output_dir, num_cores, batc dict: metadata about function call """ slide_id = Path(input_slide_image).stem - df = pd.read_csv(input_slide_tiles).set_index('address') + df = pd.read_parquet(input_slide_tiles).set_index('address') output_header_file = f"{output_dir}/{slide_id}.tiles.parquet" output_hdf_file = f"{output_dir}/{slide_id}.tiles.h5" diff --git a/pyluna-pathology/luna/pathology/common/schemas.py b/pyluna-pathology/luna/pathology/common/schemas.py index 22404ba1..d952eda7 100644 --- a/pyluna-pathology/luna/pathology/common/schemas.py +++ b/pyluna-pathology/luna/pathology/common/schemas.py @@ -13,7 +13,7 @@ class SlideTiles: @classmethod def check(self, slide_tiles): """Returns True if the given path is readable as "SlideTiles ", else, reaises SchemaMismatchError""" - df = pd.read_csv(slide_tiles) + df = pd.read_parquet(slide_tiles).reset_index() if not set(df.columns).intersection(self.REQ_COLUMNS) == self.REQ_COLUMNS: raise SchemaMismatchError( diff --git a/pyluna-pathology/tests/luna/pathology/cli/test_save_tiles.py b/pyluna-pathology/tests/luna/pathology/cli/test_save_tiles.py index 616854bd..a5c646c9 100644 --- a/pyluna-pathology/tests/luna/pathology/cli/test_save_tiles.py +++ b/pyluna-pathology/tests/luna/pathology/cli/test_save_tiles.py @@ -12,7 +12,7 @@ def test_save_cli(tmp_path): cli, [ "pyluna-pathology/tests/luna/pathology/cli/testdata/data/123.svs", - "pyluna-pathology/tests/luna/pathology/cli/testdata/data/generate_tiles/123/123.tiles.csv", + "pyluna-pathology/tests/luna/pathology/cli/testdata/data/generate_tiles/123/123.tiles.parquet", "-o", tmp_path, "-bx", @@ -23,6 +23,6 @@ def test_save_cli(tmp_path): ) assert result.exit_code == 0 - assert os.path.exists(f"{tmp_path}/123.tiles.csv") + assert os.path.exists(f"{tmp_path}/123.tiles.parquet") - assert SlideTiles.check(f"{tmp_path}/123.tiles.csv") + assert SlideTiles.check(f"{tmp_path}/123.tiles.parquet") diff --git a/pyluna-pathology/tests/luna/pathology/cli/testdata/data/generate_tiles/123/123.tiles.parquet b/pyluna-pathology/tests/luna/pathology/cli/testdata/data/generate_tiles/123/123.tiles.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e6b98a4cfcad192187b78a29a915ddbf9ac194bf GIT binary patch literal 4945 zcmcgwOKclO7#f57eDsTs(&wblB5JJ63wI8VtBH@kj?OQ0ibqk>we#KwIvxg`Utwtx` zpnr%Lxv$q_)qdPDKp|?4dVL)`pysH#)h%@IM(6Pt7&TW962m=HDm)>6h!xCNI|u0OV_qIpgm_M#F-(JTgJeCLvkMy6A z@Lfp4)qp#9M4Wh}QzV|>8Y5wAx>{{%19gnVM>m=8U&SOmmM*YAGZIVG@-Px}cQJ9R zHywv9bkA?~X!uP*d~y4{jIR@jZBw(IO16U^Fwbr?KQ7}!9&Vq3z`ycA0?~6Vxvm-_ zwS=gpH~gcX@CMpj1)o#^%iJ1sXKrRXv-2Rk^U--Z_Qt@uCWG|K>WcnhL3|CO2 zS5iaFRpdBx30WF^c0EW#vcUH)f@VO^S)!?kE&A8_Cn3#LRGgc7+b~<92W#&>37vrq zMxmx?*3{7YTF4X)6$EBm*J0MO#ik9@Fc~*27)GLGtdjB5_h8}}tP<<0Yj@JjiKgmG zbyDWws@i=E`wKxm$_>SV&^E08NYRi^RoTYS4AmAD(H3F(laOk{a?Pyay~mH!_qRh& z$G}XY2)P$GHZlQaT=Adq;NyC_4zOH~A-4s1J&q}+JZh%B6(=E>;6DdyLoJ znBo%MjczKh!wJXt>#&R_xOQk0CT`b%k>A4z=@f;pvh3I*f?Xs{BzQhQ8hl|QE zN_ES{#96tt&$0)*J1~EfH}rNTBg9M5d`HR(wq)*WkF=De_H#`U*7B!~8gJ(6lA-9l z)~)eIqAnGW@tU?$?6vkdR+D3ex@;bTjP1Jfj?D2}{LXIQo$Jm#oh%(;8?G-sX-P&w zS2Mx@`zgj+nq;Ky$`S0%BqCC#+v2&LAr^&JC7o=SixC~KlT4u{abU+x0WRON)6cd(`b!uZ4Hmt|B2X%_i ze#W^aHPH}G-Pp{;=k}L=rI-#&G2X4|o>hh(CYxs@SD_!kIM-1eywOT2HxP@-HmUd1 z{q{Z=@A4+ZfXki7))4w~RPRU{^d^q)?Vawn^GCTP?yu0dS(ZD5w!4FyENkIchVEbi+-(16uqhRXJk`}~L5KW>IadQCn=oM6WBlUE?mWfT z@th@m9e!j^pn;83DVpq8*qy`4{X6{BesW29_NRyY@jcdMIbi4BXV{6u2R6ZF3HmbJ zYAjr6 Date: Tue, 5 Apr 2022 14:11:13 -0400 Subject: [PATCH 06/18] More parquet migration --- .../luna/pathology/cli/infer_tile_labels.py | 7 ++++--- .../pathology/cli/test_infer_tile_labels.py | 18 +++++++++--------- .../data/save_tiles/123/123.tiles.parquet | Bin 0 -> 6074 bytes .../testdata/data/save_tiles/123/metadata.yml | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) create mode 100644 pyluna-pathology/tests/luna/pathology/cli/testdata/data/save_tiles/123/123.tiles.parquet diff --git a/pyluna-pathology/luna/pathology/cli/infer_tile_labels.py b/pyluna-pathology/luna/pathology/cli/infer_tile_labels.py index a244f3e4..46e08ee9 100644 --- a/pyluna-pathology/luna/pathology/cli/infer_tile_labels.py +++ b/pyluna-pathology/luna/pathology/cli/infer_tile_labels.py @@ -143,7 +143,7 @@ def infer_tile_labels( transform = ttm.transform ttm.model.to(device) - df = pd.read_csv(input_slide_tiles).set_index("address") + df = pd.read_parquet(input_slide_tiles).reset_index().set_index("address") ds = HD5FDataset(df, preprocess=preprocess) loader = DataLoader( ds, num_workers=num_cores, batch_size=batch_size, pin_memory=True @@ -165,13 +165,14 @@ def infer_tile_labels( df_scores = df_scores.rename(columns=ttm.class_labels) df_output = df.join(df_scores) + df_output.columns = df_output.columns.astype(str) logger.info(df_output) output_file = os.path.join( - output_dir, "tile_scores_and_labels_pytorch_inference.csv" + output_dir, "tile_scores_and_labels_pytorch_inference.parquet" ) - df_output.to_csv(output_file) + df_output.to_parquet(output_file) # Save our properties and params properties = { diff --git a/pyluna-pathology/tests/luna/pathology/cli/test_infer_tile_labels.py b/pyluna-pathology/tests/luna/pathology/cli/test_infer_tile_labels.py index 05e7b92e..d5e8ec85 100644 --- a/pyluna-pathology/tests/luna/pathology/cli/test_infer_tile_labels.py +++ b/pyluna-pathology/tests/luna/pathology/cli/test_infer_tile_labels.py @@ -24,11 +24,11 @@ def test_cli(tmp_path): assert result.exit_code == 0 assert SlideTiles.check( - f"{tmp_path}" f"/tile_scores_and_labels_pytorch_inference.csv" + f"{tmp_path}" f"/tile_scores_and_labels_pytorch_inference.parquet" ) # Default to 2 channels.. - df = pd.read_csv(f"{tmp_path}/tile_scores_and_labels_pytorch_inference.csv") + df = pd.read_parquet(f"{tmp_path}/tile_scores_and_labels_pytorch_inference.parquet") assert df.shape == (12, 9) assert set(["Background", "Tumor"]).intersection(set(df.columns)) == set( @@ -56,10 +56,10 @@ def test_cli_kwargs(tmp_path): assert result.exit_code == 0 assert SlideTiles.check( - f"{tmp_path}" f"/tile_scores_and_labels_pytorch_inference.csv" + f"{tmp_path}" f"/tile_scores_and_labels_pytorch_inference.parquet" ) - df = pd.read_csv(f"{tmp_path}/tile_scores_and_labels_pytorch_inference.csv") + df = pd.read_parquet(f"{tmp_path}/tile_scores_and_labels_pytorch_inference.parquet") assert df.shape == (12, 17) # 8 more @@ -83,12 +83,12 @@ def test_cli_resnet(tmp_path): ) assert result.exit_code == 0 - assert SlideTiles.check(f"{tmp_path}/tile_scores_and_labels_pytorch_inference.csv") + assert SlideTiles.check(f"{tmp_path}/tile_scores_and_labels_pytorch_inference.parquet") - assert pd.read_csv( - f"{tmp_path}/tile_scores_and_labels_pytorch_inference.csv" + assert pd.read_parquet( + f"{tmp_path}/tile_scores_and_labels_pytorch_inference.parquet" ).shape == (12, 1007) - assert pd.read_csv( - f"{tmp_path}/tile_scores_and_labels_pytorch_inference.csv" + assert pd.read_parquet( + f"{tmp_path}/tile_scores_and_labels_pytorch_inference.parquet" ).shape == (12, 1007) diff --git a/pyluna-pathology/tests/luna/pathology/cli/testdata/data/save_tiles/123/123.tiles.parquet b/pyluna-pathology/tests/luna/pathology/cli/testdata/data/save_tiles/123/123.tiles.parquet new file mode 100644 index 0000000000000000000000000000000000000000..29b2de848e6784c242ff6e5fefe93729fe296378 GIT binary patch literal 6074 zcmd^DO>7(25q@MTS+Z4IHoPnesDMBu3m;^W%O6Dy9H2`|B$ttBiL&@t6vY0CTyb~F za+klhuSE_;&Or}FPd)U|Qw~K>MNd5$Ip$CpMPLL;4@J>aPDSUvUH&ea#)za#7Uu50 znfKIvwH`3rM{P|!hiSHkEFhD-=+Ocxs5W*i9_|%% z{s%;VzOqGmMjZN8vDs`(gu`o>_a83$uO9lFA8ow_z(PttqxO%eH9~3IqJ%u_x1Nl= zMr%uX*_t_C4(SZ9H*ek``lIzJUV|xK|8jZ%>7xI0i~j8`%*$bl1kNEv3wdo)505A} z;kC8}TSYF|od=Y6W_N7O91DjOmalJHyW{d9`s0lWh4ucRD)&uUH_7Jw$>sfvi~jFz z`qgbr!eNP!a7e!k3D;5*wg%wrHv;02&XM@!-V_N_QRS+k4CN^jAKj%t|1l=vu(ZVf z!kyTlHb*6t4?S|X`aby0x%m42B#mzq7dyIQTA6HmU(g@lr@z?5i9A}q z(1Sm^sNX=jvQ9&>eujM*s2!bu+~Uo8Q*G7;K~pwNBj}<~&`+(2s$1jEcM-Mmu6x`f-bF`S5EsgWWp>H6 zM|PpIMw6Vk$K&*|>`k53kx2|=iAh{-vDfF{^1rx_c69^o;|41EjZ|9tCMqF)1+|R+ zfh4K-?{~dEND6p9M9><2t1Vuac*FPg+Vg;-OEOMPwWH}q;Qj5NKMyoO25*5juh-Vwt)1L=q9i}UK4X-_amU_G&csb?FtQP_M zGZq_}ab;TZz5K!9dbtB`*&G9Q1#}`#DdrFr-Q0_kEtugy7EjT4z}Y3%2{)y-?q#H8c8cv z5!!rvfU~MK;M?tH594&@wxKk2ERF2Ke^Uc+)<@t|I%b#IjRjsJ-v*n{KP|p*eJS|9 z*$m=~Vbd(jp281bAF^zeP1M*F%huRrjmQ#-8GR2656Ais@jLn1vxuH!*&Yk);7N?Y zb2O<8Z;4*H7;BVE$1Iy}!1}~{oThdv=|Z#=&UJ-M!4&jk<*AYoyC< zUC<DAqjT$1o#Em zOgto)5=6)8g$`fLbt~y}1!KcpQ_QFrQbz3&yUA>~thLl~Cj*0Dxrm8|Ze=+8i%+@RE{Gd|G z4M{Ao-hZ+u7Z79EzlmRZ{Q@|3Ma|;8z%Oh6BiY7Mzu=rm2j!4`1{k4~h?OGwmB)tuGddGXMwQYj$jN+te6}+ogUd|iLNXiTi+!~e$+cw7I@b&K&@o3$ry#de!=?M= z%#CTk%?kJg>qF&nKKA2BNt`328o(-2S~V(#I3s9kNE|GSS-vHt3l~x`){-(-ZY=T* z>v8Tup5oXqh+ACYwZf&nH*e$LP)f%Kwp~X(;dIMCsAG^LR(!9^KfC@7D#cVth;Vkz z>9@*IV@16|atZ1cyl1*GTP&GI3vmIp66W3oW){)ESD6ASG#3N^ZvAF}L06yqJAz*6ka zDM$KY22UTiMwU%tX~y28u)0^lkqiO^`r?;#m`!w8m-5Dy2mDgo!zeq z9N`0t;9~&vWgJRq*rbmW6?SRK{t)}n&l;ju7>Yd{|5Mxl4?aLX_)2O2vLl<1#a3(E k{#4n9uasTp@mY>NIml_y7O^ literal 0 HcmV?d00001 diff --git a/pyluna-pathology/tests/luna/pathology/cli/testdata/data/save_tiles/123/metadata.yml b/pyluna-pathology/tests/luna/pathology/cli/testdata/data/save_tiles/123/metadata.yml index d2d98d8e..f76950d3 100644 --- a/pyluna-pathology/tests/luna/pathology/cli/testdata/data/save_tiles/123/metadata.yml +++ b/pyluna-pathology/tests/luna/pathology/cli/testdata/data/save_tiles/123/metadata.yml @@ -3,7 +3,7 @@ input_slide_image: pyluna-pathology/tests/luna/pathology/cli/testdata/data/123.s input_slide_tiles: pyluna-pathology/tests/luna/pathology/cli/testdata/data/generate_tiles/123/123.tiles.csv num_cores: 1 output_dir: pyluna-pathology/tests/luna/pathology/cli/testdata/data/save_tiles/123 -slide_tiles: pyluna-pathology/tests/luna/pathology/cli/testdata/data/save_tiles/123/123.tiles.csv +slide_tiles: pyluna-pathology/tests/luna/pathology/cli/testdata/data/save_tiles/123/123.tiles.parquet total_tiles: 12 segment_keys: slide_id: '123' From e713afca8f48cb61955fca3a4c80d0fa5f76d609 Mon Sep 17 00:00:00 2001 From: Aukerman Date: Tue, 5 Apr 2022 14:20:45 -0400 Subject: [PATCH 07/18] Further migrate to parquet --- pyluna-common/luna/common/utils.py | 2 ++ .../luna/pathology/cli/generate_tile_labels.py | 10 +++++----- pyluna-pathology/luna/pathology/cli/generate_tiles.py | 4 ++-- .../luna/pathology/cli/test_generate_tile_labels.py | 2 +- .../tests/luna/pathology/cli/test_generate_tiles.py | 4 ++-- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pyluna-common/luna/common/utils.py b/pyluna-common/luna/common/utils.py index d268a479..7cb0ad65 100644 --- a/pyluna-common/luna/common/utils.py +++ b/pyluna-common/luna/common/utils.py @@ -466,6 +466,8 @@ def cli_runner( result = cli_function(**kwargs) + logger.info("Transform code completed, running post-transform functions now...") + kwargs.update(result) # filter out kwargs with sensitive data diff --git a/pyluna-pathology/luna/pathology/cli/generate_tile_labels.py b/pyluna-pathology/luna/pathology/cli/generate_tile_labels.py index 64746de8..3178f52d 100644 --- a/pyluna-pathology/luna/pathology/cli/generate_tile_labels.py +++ b/pyluna-pathology/luna/pathology/cli/generate_tile_labels.py @@ -41,7 +41,7 @@ def cli(**cli_kwargs): \b Inputs: input_slide_annotation_dataset: annotation dataset containing metadata about geojsons - input_slide_tiles: path to tile images (.tiles.csv) + input_slide_tiles: path to tile images (.tiles.parquet) \b Outputs: slide_tiles @@ -63,7 +63,7 @@ def generate_tile_labels( Args: input_slide_annotation_dataset (str): path to parquet annotation dataset - input_slide_tiles (str): path to a slide-tile manifest file (.tiles.csv) + input_slide_tiles (str): path to a slide-tile manifest file (.tiles.parquet) output_dir (str): output/working directory keys (dict): segment keys (this function needs 'slide_id') Returns: @@ -107,7 +107,7 @@ def generate_tile_labels( for label in d_collections.keys(): d_collections[label] = GeometryCollection(d_collections[label]) - df_tiles = pd.read_csv(input_slide_tiles).set_index("address") + df_tiles = pd.read_parquet(input_slide_tiles).reset_index().set_index("address") l_regional_labels = [] l_intersection_areas = [] @@ -140,8 +140,8 @@ def generate_tile_labels( logger.info(df_tiles.loc[df_tiles.intersection_area > 0]) - output_header_file = f"{output_dir}/{slide_id}.regional_label.tiles.csv" - df_tiles.to_csv(output_header_file) + output_header_file = f"{output_dir}/{slide_id}.regional_label.tiles.parquet" + df_tiles.to_parquet(output_header_file) properties = { "slide_tiles": output_header_file, # "Tiles" are the metadata that describe them diff --git a/pyluna-pathology/luna/pathology/cli/generate_tiles.py b/pyluna-pathology/luna/pathology/cli/generate_tiles.py index ce256ed3..082eb9e8 100644 --- a/pyluna-pathology/luna/pathology/cli/generate_tiles.py +++ b/pyluna-pathology/luna/pathology/cli/generate_tiles.py @@ -102,8 +102,8 @@ def generate_tiles(input_slide_image, tile_size, requested_magnification, output logger.info(df) - output_header_file = f"{output_dir}/{slide_id}.tiles.csv" - df.to_csv(output_header_file) + output_header_file = f"{output_dir}/{slide_id}.tiles.parquet" + df.to_parquet(output_header_file) properties = { "slide_tiles": output_header_file, # "Tiles" are the metadata that describe them diff --git a/pyluna-pathology/tests/luna/pathology/cli/test_generate_tile_labels.py b/pyluna-pathology/tests/luna/pathology/cli/test_generate_tile_labels.py index 4ce6fb75..5b319dde 100644 --- a/pyluna-pathology/tests/luna/pathology/cli/test_generate_tile_labels.py +++ b/pyluna-pathology/tests/luna/pathology/cli/test_generate_tile_labels.py @@ -19,7 +19,7 @@ def test_cli(tmp_path): assert result.exit_code == 0 - out_tile = pd.read_csv(f"{tmp_path}/123.regional_label.tiles.csv").set_index('address') + out_tile = pd.read_parquet(f"{tmp_path}/123.regional_label.tiles.parquet").reset_index().set_index('address') assert out_tile.loc["x1_y1_z10.0", "regional_label"] == "Other" assert out_tile.loc["x3_y4_z10.0", "regional_label"] == "Tumor" diff --git a/pyluna-pathology/tests/luna/pathology/cli/test_generate_tiles.py b/pyluna-pathology/tests/luna/pathology/cli/test_generate_tiles.py index 622274c6..bab38441 100644 --- a/pyluna-pathology/tests/luna/pathology/cli/test_generate_tiles.py +++ b/pyluna-pathology/tests/luna/pathology/cli/test_generate_tiles.py @@ -22,9 +22,9 @@ def test_cli(tmp_path): ) assert result.exit_code == 0 - assert os.path.exists(f"{tmp_path}/123.tiles.csv") + assert os.path.exists(f"{tmp_path}/123.tiles.parquet") - assert SlideTiles.check(f"{tmp_path}/123.tiles.csv") + assert SlideTiles.check(f"{tmp_path}/123.tiles.parquet") def test_cli_bad_mag(tmp_path): From 326f28c829662194b93b0b8bef52258961bf76aa Mon Sep 17 00:00:00 2001 From: Aukerman Date: Tue, 5 Apr 2022 14:37:00 -0400 Subject: [PATCH 08/18] Parquet migration tests pass --- pyluna-common/luna/common/stats.py | 6 +++--- .../cli/extract_kfunction_statistics.py | 4 ++-- .../pathology/cli/extract_tile_statistics.py | 5 +++-- .../luna/pathology/cli/generate_tile_mask.py | 6 +++--- .../cli/test_extract_kfunction_statistics.py | 6 +++--- .../cli/test_extract_tile_statistics.py | 10 ++++------ .../data/infer_tumor_background/123/metadata.yml | 2 +- ...e_scores_and_labels_pytorch_inference.parquet | Bin 0 -> 7583 bytes .../123_cell_objects.parquet | Bin 0 -> 7313 bytes .../data/stardist_cell_detection/metadata.yml | 3 ++- .../cli/testdata/test_tile_stats.parquet | Bin 0 -> 5307 bytes 11 files changed, 21 insertions(+), 21 deletions(-) create mode 100644 pyluna-pathology/tests/luna/pathology/cli/testdata/data/infer_tumor_background/123/tile_scores_and_labels_pytorch_inference.parquet create mode 100644 pyluna-pathology/tests/luna/pathology/cli/testdata/data/stardist_cell_detection/123_cell_objects.parquet create mode 100644 pyluna-pathology/tests/luna/pathology/cli/testdata/test_tile_stats.parquet diff --git a/pyluna-common/luna/common/stats.py b/pyluna-common/luna/common/stats.py index 99dd7196..e4fff50b 100644 --- a/pyluna-common/luna/common/stats.py +++ b/pyluna-common/luna/common/stats.py @@ -14,7 +14,7 @@ def compute_stats_1d(vec, fx_name_prefix, n_percentiles=4): dict: summary statistics """ n, _, sm, sv, ss, sk = scipy.stats.describe(vec) - ln_params = scipy.stats.lognorm.fit(vec, floc=0) + # ln_params = scipy.stats.lognorm.fit(vec, floc=0) hist_features = { f'{fx_name_prefix}_nobs': n, @@ -22,8 +22,8 @@ def compute_stats_1d(vec, fx_name_prefix, n_percentiles=4): f'{fx_name_prefix}_variance': sv, f'{fx_name_prefix}_skewness': ss, f'{fx_name_prefix}_kurtosis': sk, - f'{fx_name_prefix}_lognorm_fit_p0': ln_params[0], - f'{fx_name_prefix}_lognorm_fit_p2': ln_params[2] + # f'{fx_name_prefix}_lognorm_fit_p0': ln_params[0], + # f'{fx_name_prefix}_lognorm_fit_p2': ln_params[2] } percentiles = np.linspace(0, 100, n_percentiles + 1) diff --git a/pyluna-pathology/luna/pathology/cli/extract_kfunction_statistics.py b/pyluna-pathology/luna/pathology/cli/extract_kfunction_statistics.py index e3e2ed2f..f2e688ad 100644 --- a/pyluna-pathology/luna/pathology/cli/extract_kfunction_statistics.py +++ b/pyluna-pathology/luna/pathology/cli/extract_kfunction_statistics.py @@ -116,8 +116,8 @@ def extract_kfunction(input_cell_objects, tile_size, intensity_label, tile_strid logger.info("Generated k-function feature data:") logger.info (df_stats) - output_tile_header = os.path.join(output_dir, Path(input_cell_objects).stem + '_kfunction_supertiles.csv') - df_stats.to_csv(output_tile_header) + output_tile_header = os.path.join(output_dir, Path(input_cell_objects).stem + '_kfunction_supertiles.parquet') + df_stats.to_parquet(output_tile_header) properties = { 'slide_tiles': output_tile_header, diff --git a/pyluna-pathology/luna/pathology/cli/extract_tile_statistics.py b/pyluna-pathology/luna/pathology/cli/extract_tile_statistics.py index 1fa4460d..37ec4de9 100644 --- a/pyluna-pathology/luna/pathology/cli/extract_tile_statistics.py +++ b/pyluna-pathology/luna/pathology/cli/extract_tile_statistics.py @@ -65,9 +65,10 @@ def extract_tile_statistics(input_slide_tiles, output_dir): """ df = ( - pd.read_csv(input_slide_tiles) + pd.read_parquet(input_slide_tiles) + .reset_index() .set_index("address") - .drop(columns=["x_coord", "y_coord", "tile_size", 'xy_extent', 'tile_units']) + .drop(columns=["x_coord", "y_coord", "tile_size", 'xy_extent', 'tile_units'], errors='ignore') ) print(df.columns) diff --git a/pyluna-pathology/luna/pathology/cli/generate_tile_mask.py b/pyluna-pathology/luna/pathology/cli/generate_tile_mask.py index bc18ca92..5c1a46bc 100644 --- a/pyluna-pathology/luna/pathology/cli/generate_tile_mask.py +++ b/pyluna-pathology/luna/pathology/cli/generate_tile_mask.py @@ -54,9 +54,9 @@ def cli(**cli_kwargs): output data \b Example: - convert_tiles_to_mask ./slides/10001.svs ./tiles_scores_and_labels.csv + convert_tiles_to_mask ./slides/10001.svs ./tiles_scores_and_labels.parquet -lc Background,Tumor - -o ./label_mask.csv + -o ./label_mask.parquet """ cli_runner(cli_kwargs, _params_, convert_tiles_to_mask) @@ -93,7 +93,7 @@ def convert_tiles_to_mask( # check if tile_col is a valid argument logger.info("Reading SlideTiles") print(label_cols) - tile_df = pd.read_csv(input_slide_tiles).set_index("address") + tile_df = pd.read_parquet(input_slide_tiles).reset_index().set_index("address") if not set(label_cols).issubset(tile_df.columns): raise ValueError(f"Invalid label_cols={label_cols}, verify input dataframe") diff --git a/pyluna-pathology/tests/luna/pathology/cli/test_extract_kfunction_statistics.py b/pyluna-pathology/tests/luna/pathology/cli/test_extract_kfunction_statistics.py index 832b2d76..be06e324 100644 --- a/pyluna-pathology/tests/luna/pathology/cli/test_extract_kfunction_statistics.py +++ b/pyluna-pathology/tests/luna/pathology/cli/test_extract_kfunction_statistics.py @@ -10,7 +10,7 @@ def test_cli(tmp_path): result = runner.invoke( cli, [ - "pyluna-pathology/tests/luna/pathology/cli/testdata/test_tile_stats.csv", + "pyluna-pathology/tests/luna/pathology/cli/testdata/test_tile_stats.parquet", "-o", tmp_path, "-il", @@ -25,8 +25,8 @@ def test_cli(tmp_path): ) assert result.exit_code == 0 - assert os.path.exists(f"{tmp_path}/test_tile_stats_kfunction_supertiles.csv") + assert os.path.exists(f"{tmp_path}/test_tile_stats_kfunction_supertiles.parquet") - df = pd.read_csv(f"{tmp_path}/test_tile_stats_kfunction_supertiles.csv") + df = pd.read_parquet(f"{tmp_path}/test_tile_stats_kfunction_supertiles.parquet") assert "ikfunction_r160.0_stainCentroid_X_µm" in df.columns assert df["ikfunction_r160.0_stainCentroid_X_µm_norm"].values[0] == 1.0 diff --git a/pyluna-pathology/tests/luna/pathology/cli/test_extract_tile_statistics.py b/pyluna-pathology/tests/luna/pathology/cli/test_extract_tile_statistics.py index 7d365920..15de85ba 100644 --- a/pyluna-pathology/tests/luna/pathology/cli/test_extract_tile_statistics.py +++ b/pyluna-pathology/tests/luna/pathology/cli/test_extract_tile_statistics.py @@ -5,20 +5,20 @@ from luna.pathology.cli.extract_tile_statistics import cli -def test_cli_generate_mask(tmp_path): +def test_cli_extract_tile_statistics(tmp_path): runner = CliRunner() result = runner.invoke( cli, [ - "pyluna-pathology/tests/luna/pathology/cli/testdata/test_tile_stats.csv", + "pyluna-pathology/tests/luna/pathology/cli/testdata/test_tile_stats.parquet", "-o", tmp_path, ], ) assert result.exit_code == 0 - assert os.path.exists(f"{tmp_path}/test_tile_stats_tile_stats.csv") - df = pd.read_csv(f"{tmp_path}/test_tile_stats_tile_stats.csv") + assert os.path.exists(f"{tmp_path}/test_tile_stats_tile_stats.parquet") + df = pd.read_parquet(f"{tmp_path}/test_tile_stats_tile_stats.parquet") cols = df.columns for col in [ "Centroid X µm_nobs", @@ -26,8 +26,6 @@ def test_cli_generate_mask(tmp_path): "Centroid X µm_variance", "Centroid X µm_skewness", "Centroid X µm_kurtosis", - "Centroid X µm_lognorm_fit_p0", - "Centroid X µm_lognorm_fit_p2", "Centroid X µm_pct0", "Centroid X µm_pct25", "Centroid X µm_pct50", diff --git a/pyluna-pathology/tests/luna/pathology/cli/testdata/data/infer_tumor_background/123/metadata.yml b/pyluna-pathology/tests/luna/pathology/cli/testdata/data/infer_tumor_background/123/metadata.yml index e4137511..0c4c6a38 100644 --- a/pyluna-pathology/tests/luna/pathology/cli/testdata/data/infer_tumor_background/123/metadata.yml +++ b/pyluna-pathology/tests/luna/pathology/cli/testdata/data/infer_tumor_background/123/metadata.yml @@ -14,5 +14,5 @@ kwargs: {} model_name: test_custom_model num_cores: 4 output_dir: pyluna-pathology/tests/luna/pathology/cli/testdata/data/infer_tumor_background/123/ -slide_tiles: pyluna-pathology/tests/luna/pathology/cli/testdata/data/infer_tumor_background/123/tile_scores_and_labels_pytorch_inference.csv +slide_tiles: pyluna-pathology/tests/luna/pathology/cli/testdata/data/infer_tumor_background/123/tile_scores_and_labels_pytorch_inference.parquet total_tiles: 12 diff --git a/pyluna-pathology/tests/luna/pathology/cli/testdata/data/infer_tumor_background/123/tile_scores_and_labels_pytorch_inference.parquet b/pyluna-pathology/tests/luna/pathology/cli/testdata/data/infer_tumor_background/123/tile_scores_and_labels_pytorch_inference.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9258c0ef3b8fe22b8f1b2062d72d94a286c8b4ca GIT binary patch literal 7583 zcmd^EZERat8NT+MG)=;qR=M7dnvGzsS4C^%uQ*MnO5U{-JNA;aaTqt}Rre0VD<$urf^*8U+gC2SNxg#jf_nh~6-uImMyvONDCgLP5D(hu((pL_BZH{qe1%XQFp33p&1|va*TZR2m&B`$==aP z;_yJFsip>6zyNyaxEbD*$t4#RW} zo`45giNhVFpc*`)`C;xMR$e9O8$F|>rS3x|#piMb0sC;%WPRU6U$W5ud|>n*06Mz# z1UWTN_F%cTH97f*KG5`Acg6B$F$WkQ3QQ{u<63y zN;M@`ilS1)i}Qxbdcj10)K72qBPR@&uoDL9$8o~c)d^h#aC!@a7^K^rcyV8g6UD3~ zrqt}F*y6;w2k9H1Ku#Deb+O-ZB?if%`bzW;BggZV?dvc}95&}G6nwT(yt%)b#{HOL zNy!$qOfFiVrO)rDKNv!ZT%X>N!S_t$*WtL*c0(+)j5t)KlENO%v&BqK%B8FJqNo;C zyNM+1BV->hWqU@31h&Z9dwMb|TNYCYT(vu0ZuA+)z445vgCombOd~vd6AvITCI-Q; zyTDNQ!|1k9zy8qnP^-mULE8>DE!Hgy{f%Mz4+l`N?+2Uv<6_6{aKS`gZB(z%{4F(` zx%81`^Jlg64}SCT*7Efld*{k~Oy;Mxue?2U`F9^&t9|k9celQ*T&(@|_w47Mvs|cs z&9pIa``KEJJN4T82cEiId*schGao*;Rr^-{Y3jn&Yqf#+L#L_p*K21k4^JKZ=CvAZ zlD29__0=s%QHR=g?vYWlU#|=OK9*lk7jh*oNm~faq2l$l5+)owrL@H>r?bEKM5f+8tVelAw=ep z?|KNTSLHLJ%#sHPa`2G3aoadV%#Q*u(t>1q_11fO&C`0#@}c(|C$VO)<=Cs6L@=5d zr)XW=kHpRWK#)kFIZjxqkc@^rGC->j4i7OB07GglB}q<$H$&}#Ma`4^5{Fw?*7Odj>k1!4$SG>UP`BtY$??NVUr}Ua4X= zkML}8nd5VUMxi9O)4_sI)5nCq5`#XO93a-A9{#${P_|6x$H9N^x=mFF~9C4RMS61~|Qp7~xtJI|$eK znPUWOQ|%yRqZAq;+f+LZiGO5t!EETZf!^C*HY}#*54v%8!NsK>>2?;kd0#N%w(GXx zu1Yz!*b3z-qKA>IKvd`=Za7NUxe4=KwP*g_SZ9{DO;DC9N}epxK2*_?u; z?s53*l?`&b9^@9CbKE)Zv7IHTK_(J85qn&_^-&jw?dX|I&oKU~M;2d3l8$cBz3P3Ji|B8kE1Cc=ri)hs6m zipgb|8}vB2;D#FVhh;VvRg;0~QX=M*&^S(ss+=F}C_dl?*v@H(824daNQsu%*kU;u zT*)GA7+d7$q*Y-~+Q4@G;c`OGO9}UqrjJD(iC82f1ZR;?5RYgn;hJ6t-xch;6kUmA z9TOqN+0MNgrv|T(99U1zg(cp-fNdvbN#cT$)rL?0a3&dyRFm<=O&rUe=a1K96=DqQ z*W{&SR>7w-FKa$`;H9m6Cq|$pwGd!Nw|5+`X6=Wo=(~SVO}YJ{W-94s3Q^Cu453@-8NU6U=uc z8u?h6p7*01shob6Re6;)e7n_aIME>^sD5 zI?Kw@HGOUFh2N$SoUZD24d+Qn(fGkR205a|cenU0$$>T2B~&?gw3O_&=hnlSgsZ6V zGK#a7W1aXV!4P2lG!1<=Kv2ZmJ4}0)At@u39hB)$=iy%jksz2E$0E}F@(1{15+`7KPIh(riDME>uWuuP+!&Y7-j__ z_}1(AOa}Ft?f79N6rr9c;~7U1Y7Xk>PQom_5;(k!T<TMocyJT9Fg38D*F?rq zAFnMBQ?2*)`S=oLnBS*8EFR_a`7!;}Qap%$HMdsx4+=){oFTo!ZyZs}+1V8&v!hg;F E2Lr77LjV8( literal 0 HcmV?d00001 diff --git a/pyluna-pathology/tests/luna/pathology/cli/testdata/data/stardist_cell_detection/123_cell_objects.parquet b/pyluna-pathology/tests/luna/pathology/cli/testdata/data/stardist_cell_detection/123_cell_objects.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6ecdc9a29f58271c2b6b9274d635a634c62ca5c7 GIT binary patch literal 7313 zcmeHM+ixRR8J~%q;FOf6+c-E<5GU?twJT+3XU<&DD*G^YY-ilq**J-B6jo!8*Pdj2 zsV{MyvJZ>I15hE~_0bM(B3K=~o(cB=SVT9FHVbtl9~$SRk- z&&VsQYbt;WAxYtQNuGugw%3yR388rHw+BN~f>0Bu49WWMYd36B10PU1D`%}2}}hUK4s z#75Zn+3zi50nkJ%;I$w^vc_rhblayY>~~&#r2Ds22}5ikn*c*lqNx#I)D{3^qN;xT zCq5b9vos8N;sFL=7re^>CV&E$*JzCgW&$(I3Ul>cmex4E9$Vxd?A6fEz1A ze7W=^@f5d%jc94OKm&d06w+KP;KPHP$3tM>XdWaNOIV`O=K+2|lk@9S@Og>#P0!=K zBR{-f`5>?ym|K#4%*;~ouTv?r2DFJ##I0#C-E5WX9fZ=`1|3GncS?;$%R>FZbEDiT zmpV33Y_ruUnnl~OJAKegbxTgU+rEkPR@p?q`(^DL;4i&Kf~|7B(y3C>89~HumAa1E z@)jCT*ljqrH5fd-P|`d`mJN1yfci(vo!TjB*Dq9vlpSZnKyTobVcx(QgJ5rOd~A^S z{Q^Sbcx|h~~EPR0}2FQax82b1H8?c5)g~H_- z7%V~7uk(TD7y)h!w+Occw+y!uc;f02meyq^IMXgw%QZ9LV*+!lQ_t#@7Vgup(^Y0`nZk&M ze3-_;*y5$7<)yV}NNOXPzrhj>Hy!@a4rzP%H3UV3?(~;7rr3 zTV^}>_u1D&cHJt|Q-#y5)!U()=xd9apLBieC@aq1S_Az3Vujan~+8 z)}6^QZ}~f&ev>MOx^=tMu*xCOu6JwA{z%X$9>D2^j>GA}w9JkP>aT~&bx^nK6*_l+ zFLqrE-Mcq7X3~n#(>XD{+U5D8{Mb{|K>Q?6P7_N$?(}p<$*Xzyh&MKPf(V8ayuttJ zBsE-Oc@ez{N+POMGpSN3W24=%}>b$PT=IwJtjD(r_rzO z@98iPp6(*>94h7EE!@jz<->d~sq5Pfm>X7&n$s=BGg1zxj*6R^PO+Y}x9xDzsHL1@ z{P?b!-LZ4Ky$XEyTYiXw(&zu+sjY?K*+r?U}Q`j{WBQ6WYc&}|l5;ZfMX%}Lo zo6iak)hpIB?P3JhsfVjDzgiLqU(SVHnNiPl&1~wZ5Z|}ABDxK_ozkXr*V=S??z)l0 zQNGr6^5Tvs&t~|1HeI#iQEDsrkm=?zIW)6!o$h5gnfU&ux)xy}7vGHwXPjMS(5(tEF- zIVi?6Lg}PY*^6ho1u@+)V#h*qBW$O#ysHCtZClw?yO7JAfZrQtE^%OG!RFdvuSxk{ z5g#`TLp$7kCg!K|xk?4(;D3fM ziZzQjBjJ*@r}y#)na+4#s#&U~07tRDnM=#V_>AN+z+v1u^km_ULiV@~vG{afM*9j) z$PwTK@S00ICHvI=HpO_M3F{r^u!{-6mTvF4*rquOZ-7lFr9QmJ!VvRJ*MgYtACiP; zW7w-9iV46j;PI$b+o`1E5Zi2uH?yQ`Y=-+qoT%n&PJ8Iv>Add%=JLB^n{o%nMR`va z0OwY`az0-ZSi2yOr{xoI^e{MyRiRprJSJ8H%$~tx%FIfiiKiV5Jh0KIT;%Ieb`k!E zvK{R7^J1nA_1UyG4+rs~wd<|Bk4f9irajp>@1N?+aPHpY@$K?XDINo^jGg4fv|IwN zqWG|~a+7io@D$~}u9X#f<;*E=13oZozz-2uPH|U-HTe99FOSJqgmTyzb9w4Zj5G`0 zoO4`ynwu$KJ}tWj+zFf&W8lXz|-)aII{Y#f4GoKoVZx0JbaMs zCL>bMsPi;7gWi+6Yt=IS5~h3!JtJZqZyiLtsoeyD1{I@Bb0be%CA@^gX)4ia z6QliYc+noJkK(0=8#j(_+_7tx7~h4 z@f}S8#!@G=w)cUHpyqhM`|ch;z}#kaRO68EVPA++UdfB;W)Vx3(kAU&G26@@53oAH zzhU5zs*!xhVIgC+R-|SVj~$2gstAvr*q+qM#!lit7NYr!J@L%g zc*b^|#nK81#9NVgK+7Z0*S;0)6IvAswN=}f7SRWkT_nT{JfKP|l>!y#-WfaoORCaU z7$tM>J?GqWzH{!m=S;H0WNFGytx>nrl#Q}G3F2#&=Lv#H)p#}-xVd(V4ut~Y;2M2v zZCP*W6hS#D*Al$!)HF5iStAyoKS$Jk9s{(`cx~`@lBARH|Ip^4<`%+sVtT=Sl@rB= zr0bN;<#c^ax;`cS-}5{EH!BNEfYDCb2*8X60u2FbISs&(pKd!z%#bf5270cT1Cn-A zo-+^KjvmMDAT1deu{ePW%(*4gFWb}$tc~zLobpiio@>VkLao*iv1_NK>j~-qrQQG0 z70(wi@c(rw`Wm=&>KJgTGv?A;SN)$&e`%NIsT;j)ba)6W>Z7UWcJw%BGjuTHR5+Ny zg!{{LXD-YbvLYSm@`;4cMMEXCR_5XSnoEX=1j{b?~@>7z!Po~jLfn-ZXQR>`( z`7|1W=Vu?CMh{8SEJXfCCzBmWYJsj zzWIiV1M!Z{^;?_&{cHZ$zXE|6*LseviGjFJQa^@3%;P{r|MBg+ufF+o@lW08@18E+ zef86BH1+3`8?XPX8~xGWG7rE0pKkP9vw!?m{b@G}I0@q4-6&!w|7pp8)%^4-)O@et>M91FrQ_ZHUs-$#7bc72^C=$YZvq3YO$~dQvvqIzR9Xevy{xO z7J}O?K9w_gZA-o*$9So;+2BByJ*t&iZL`9wqQc7UGOLCwe10FvG{yX}o{lrJ5Xx2r zEdw;BZLM2EoLytrt2!IsJQ0(*Hqx_Iphd(l1|zzdr<;YKX}eVD+iVu|w1Q-LEvNHw zWUo{&<&Ud{-~_wL1Yl2`6hhgskVKdmScmNr?#f6zP;qpT}`& zl|zVGOHecC;(}ZVY#*6?I2%JqDTiW8X+xaXz}GsjWrEw2VtO8?+~;9ZE*+DYRP<6V za>5nrR<4ic9^}`kj245t0r^i4=9tnH*NXw`zR-Lj79Pyep;+c~0mz$*nB3^!sl(lN z8SdX%-c+|0Udwj)pi*TM<#Hjl-`+~HG;ikk_<3F|Z`FY9!Shr3WTbKt=8}5cDi*%j z8E1mIrkL7narsO+zZ(<8J*Hhe${A){#$$V-{KPnybUv84C@)9+q7^ZjfclrM^lMDY zs!5?u1?Vk7HtPVNRAfO-bR;vs+fqK!5!6TsW7kTGpl#Rr;3}>WTbQw-6)l2bMiQRgI=AjpKTF_sRx-wHktp*7)$9NOAhxOwna|aDW zagVRZjeG!e0Vnj2qKOw#%CTx+Uj>T@=slos6RijD2wK;6U>9}|wLFGVd+Q8~J694@ zd#w=kAz7#|R!w38J$c+R!N1 Date: Tue, 5 Apr 2022 16:05:34 -0400 Subject: [PATCH 09/18] Forgot about run detection --- pyluna-common/luna/common/utils.py | 17 ++++++++++------- .../luna/pathology/cli/run_tissue_detection.py | 6 +++--- .../luna/pathology/cli/save_tiles.py | 2 +- .../pathology/cli/test_run_tissue_detection.py | 8 ++++---- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/pyluna-common/luna/common/utils.py b/pyluna-common/luna/common/utils.py index 7cb0ad65..9d413d86 100644 --- a/pyluna-common/luna/common/utils.py +++ b/pyluna-common/luna/common/utils.py @@ -453,12 +453,10 @@ def cli_runner( logger.info (f"Full segment key set: {keys}") # Nice little log break - print( - "\n" - + "-" * 35 - + f" Running transform::{cli_function.__name__} " - + "-" * 35 - + "\n" + logger.info( + "-" * 60 + + f"\n Starting transform::{cli_function.__name__} \n" + + "-" * 60 ) with CodeTimer(logger, name=f"transform::{cli_function.__name__}"): @@ -466,7 +464,12 @@ def cli_runner( result = cli_function(**kwargs) - logger.info("Transform code completed, running post-transform functions now...") + # Nice little log break + logger.info( + "-" * 60 + + f"\n Done with transform, running post-transform functions... \n" + + "-" * 60 + ) kwargs.update(result) diff --git a/pyluna-pathology/luna/pathology/cli/run_tissue_detection.py b/pyluna-pathology/luna/pathology/cli/run_tissue_detection.py index 8038dbcc..be4d9652 100644 --- a/pyluna-pathology/luna/pathology/cli/run_tissue_detection.py +++ b/pyluna-pathology/luna/pathology/cli/run_tissue_detection.py @@ -131,7 +131,7 @@ def detect_tissue(input_slide_image, input_slide_tiles, requested_magnification, """ slide = openslide.OpenSlide(input_slide_image) slide_id = Path(input_slide_image).stem - df = pd.read_csv(input_slide_tiles).set_index('address') + df = pd.read_parquet(input_slide_tiles).reset_index().set_index('address') logger.info (f"Slide dimensions {slide.dimensions}") @@ -208,9 +208,9 @@ def detect_tissue(input_slide_image, input_slide_tiles, requested_magnification, logger.info (df) - output_header_file = f"{output_dir}/{slide_id}-filtered.tiles.csv" + output_header_file = f"{output_dir}/{slide_id}-filtered.tiles.parquet" - df.to_csv(output_header_file) + df.to_parquet(output_header_file) properties = { "slide_tiles": output_header_file, diff --git a/pyluna-pathology/luna/pathology/cli/save_tiles.py b/pyluna-pathology/luna/pathology/cli/save_tiles.py index d788343b..cb0f09c2 100644 --- a/pyluna-pathology/luna/pathology/cli/save_tiles.py +++ b/pyluna-pathology/luna/pathology/cli/save_tiles.py @@ -87,7 +87,7 @@ def save_tiles(input_slide_image, input_slide_tiles, output_dir, num_cores, batc dict: metadata about function call """ slide_id = Path(input_slide_image).stem - df = pd.read_parquet(input_slide_tiles).set_index('address') + df = pd.read_parquet(input_slide_tiles).reset_index().set_index('address') output_header_file = f"{output_dir}/{slide_id}.tiles.parquet" output_hdf_file = f"{output_dir}/{slide_id}.tiles.h5" diff --git a/pyluna-pathology/tests/luna/pathology/cli/test_run_tissue_detection.py b/pyluna-pathology/tests/luna/pathology/cli/test_run_tissue_detection.py index 0ac7d73f..19a63257 100644 --- a/pyluna-pathology/tests/luna/pathology/cli/test_run_tissue_detection.py +++ b/pyluna-pathology/tests/luna/pathology/cli/test_run_tissue_detection.py @@ -8,26 +8,26 @@ def test_otsu(tmp_path): runner = CliRunner() result = runner.invoke(cli, [ 'pyluna-pathology/tests/luna/pathology/cli/testdata/data/123.svs', - 'pyluna-pathology/tests/luna/pathology/cli/testdata/data/generate_tiles/123/123.tiles.csv', + 'pyluna-pathology/tests/luna/pathology/cli/testdata/data/generate_tiles/123/123.tiles.parquet', '-o', tmp_path, '-rmg', 5, '-nc', 1, '-fq', 'otsu_score > 0.5']) assert result.exit_code == 0 - assert os.path.exists(f"{tmp_path}/123-filtered.tiles.csv") + assert os.path.exists(f"{tmp_path}/123-filtered.tiles.parquet") assert os.path.exists(f"{tmp_path}/metadata.yml") def test_stain(tmp_path): runner = CliRunner() result = runner.invoke(cli, [ 'pyluna-pathology/tests/luna/pathology/cli/testdata/data/123.svs', - 'pyluna-pathology/tests/luna/pathology/cli/testdata/data/generate_tiles/123/123.tiles.csv', + 'pyluna-pathology/tests/luna/pathology/cli/testdata/data/generate_tiles/123/123.tiles.parquet', '-o', tmp_path, '-rmg', 5, '-nc', 1, '-fq', 'stain0_score > 0.05']) assert result.exit_code == 0 - assert os.path.exists(f"{tmp_path}/123-filtered.tiles.csv") + assert os.path.exists(f"{tmp_path}/123-filtered.tiles.parquet") assert os.path.exists(f"{tmp_path}/metadata.yml") From e0435e544408f4eb2f98f3b1e7d9451813e1226c Mon Sep 17 00:00:00 2001 From: Aukerman Date: Tue, 5 Apr 2022 16:27:48 -0400 Subject: [PATCH 10/18] Combine ifs --- pyluna-common/luna/common/utils.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pyluna-common/luna/common/utils.py b/pyluna-common/luna/common/utils.py index 9d413d86..19c52c06 100644 --- a/pyluna-common/luna/common/utils.py +++ b/pyluna-common/luna/common/utils.py @@ -489,16 +489,15 @@ def cli_runner( yaml.dump(kwargs, fp) # Save feature data in parquet - if dataset_id is not None: + if dataset_id is not None and "feature_data" in kwargs: - if "feature_data" in kwargs: - feature_data = kwargs.get("feature_data") - logger.info(f"Adding feature segment {feature_data} to {dataset_id}") + feature_data = kwargs.get("feature_data") + logger.info(f"Adding feature segment {feature_data} to {dataset_id}") - dataset_url = get_dataset_url() + dataset_url = get_dataset_url() - if dataset_url is not None: - post_to_dataset( feature_data, dataset_url, dataset_id, keys=kwargs['segment_keys']) + if dataset_url is not None: + post_to_dataset( feature_data, dataset_url, dataset_id, keys=kwargs['segment_keys']) logger.info("Done.") From 0b3efb0fc77c7aebf1c3be1914841d49716f8bd8 Mon Sep 17 00:00:00 2001 From: Aukerman Date: Wed, 6 Apr 2022 12:24:55 -0400 Subject: [PATCH 11/18] Better loggin --- pyluna-common/luna/common/utils.py | 62 +++++++++++++++++------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/pyluna-common/luna/common/utils.py b/pyluna-common/luna/common/utils.py index 19c52c06..05fb5f33 100644 --- a/pyluna-common/luna/common/utils.py +++ b/pyluna-common/luna/common/utils.py @@ -248,7 +248,8 @@ def validate_params(given_params: dict, params_list: List[tuple]): Returns: dict: Validated and casted keyword argument dictonary """ - logger = logging.getLogger(__name__) + logger.info("Validating params...") + d_params = {} for param, dtype in params_list: if given_params.get(param, None) is None: @@ -280,9 +281,9 @@ def validate_params(given_params: dict, params_list: List[tuple]): raise e if param in MASK_KEYS: - logger.info(f"Param {param} set = *****") + logger.info(f" -> Param {param} ({dtype}) set = *****") else: - logger.info(f"Param {param} set = {d_params[param]}") + logger.info(f" -> Param {param} ({dtype}) set = {d_params[param]}") return d_params @@ -299,12 +300,15 @@ def expand_inputs(given_params: dict): d_params = {} d_keys = {} + logger.info("Expanding inputs...") + for param, param_value in given_params.items(): if "input_" in param: # We want to treat input_ params a bit differently # For some inputs, they may be defined as a directory, where metadata about them is at the provided directory path expected_metadata = os.path.join(param_value, "metadata.yml") - print(expected_metadata) + logger.info(f"Attempting to read metadata at {expected_metadata}") + if os.path.isdir(param_value) and os.path.exists( expected_metadata ): # Check for this metadata file @@ -328,7 +332,7 @@ def expand_inputs(given_params: dict): f"No matching output slot of type [{param.replace('input_', '')}] at given input directory" ) - logger.info(f"Expanded input {param_value} -> {expanded_input}") + logger.info(f"Expanded input:\n -> {param_value}\n -> {expanded_input}") d_params[param] = expanded_input # Query any keys: @@ -421,34 +425,37 @@ def cli_runner( None """ - logger.info(f"Running {cli_function} with {cli_kwargs}") - kwargs = {} + logger.info(f"Started CLI Runner wtih {cli_function}") + logger.debug(f"cli_kwargs={cli_kwargs}") + logger.debug(f"cli_params={cli_params}") + logger.debug(f"pass_keys={pass_keys}") + + trm_kwargs = {} + # if "output_dir" not in cli_kwargs.keys(): # raise RuntimeError("CLI Runners assume an output directory") - dataset_id = cli_kwargs.get("dataset_id", None) - # Get params from param file if cli_kwargs.get("method_param_path"): with open(cli_kwargs.get("method_param_path"), "r") as yaml_file: yaml_kwargs = yaml.safe_load(yaml_file) - kwargs.update(yaml_kwargs) # Fill from json + trm_kwargs.update(yaml_kwargs) # Fill from json for key in list(cli_kwargs.keys()): if cli_kwargs[key] is None: del cli_kwargs[key] # Override with CLI arguments - kwargs.update(cli_kwargs) + trm_kwargs.update(cli_kwargs) - kwargs = validate_params(kwargs, cli_params) + trm_kwargs = validate_params(trm_kwargs, cli_params) - if "output_dir" in kwargs: - output_dir = kwargs["output_dir"] + if "output_dir" in trm_kwargs: + output_dir = trm_kwargs["output_dir"] os.makedirs(output_dir, exist_ok=True) # Expand implied inputs - kwargs, keys = expand_inputs(kwargs) + trm_kwargs, keys = expand_inputs(trm_kwargs) logger.info (f"Full segment key set: {keys}") @@ -462,7 +469,7 @@ def cli_runner( with CodeTimer(logger, name=f"transform::{cli_function.__name__}"): if pass_keys: cli_function = partial (cli_function, keys=keys) - result = cli_function(**kwargs) + result = cli_function(**trm_kwargs) # Nice little log break logger.info( @@ -471,33 +478,34 @@ def cli_runner( + "-" * 60 ) - kwargs.update(result) + trm_kwargs.update(result) # filter out kwargs with sensitive data for key in MASK_KEYS: - kwargs.pop(key, None) + trm_kwargs.pop(key, None) # propagate keys - if kwargs.get('segment_keys', None): - kwargs['segment_keys'].update(keys) + if trm_kwargs.get('segment_keys', None): + trm_kwargs['segment_keys'].update(keys) else: - kwargs['segment_keys'] = keys + trm_kwargs['segment_keys'] = keys # Save metadata on disk - if "output_dir" in kwargs: + if "output_dir" in trm_kwargs: with open(os.path.join(output_dir, "metadata.yml"), "w") as fp: - yaml.dump(kwargs, fp) + yaml.dump(trm_kwargs, fp) - # Save feature data in parquet - if dataset_id is not None and "feature_data" in kwargs: + # Save feature data in parquet if indicated: + if "dataset_id" in cli_kwargs and "feature_data" in trm_kwargs: + dataset_id = cli_kwargs.get("dataset_id") + feature_data = trm_kwargs.get("feature_data") - feature_data = kwargs.get("feature_data") logger.info(f"Adding feature segment {feature_data} to {dataset_id}") dataset_url = get_dataset_url() if dataset_url is not None: - post_to_dataset( feature_data, dataset_url, dataset_id, keys=kwargs['segment_keys']) + post_to_dataset( feature_data, dataset_url, dataset_id, keys=trm_kwargs['segment_keys']) logger.info("Done.") From 1e67a1e8a7f05e8b9ebebb102362a69885399163 Mon Sep 17 00:00:00 2001 From: Aukerman Date: Wed, 6 Apr 2022 16:49:12 -0400 Subject: [PATCH 12/18] Set language --- pyluna-common/luna/common/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyluna-common/luna/common/utils.py b/pyluna-common/luna/common/utils.py index 05fb5f33..3cee58eb 100644 --- a/pyluna-common/luna/common/utils.py +++ b/pyluna-common/luna/common/utils.py @@ -281,9 +281,9 @@ def validate_params(given_params: dict, params_list: List[tuple]): raise e if param in MASK_KEYS: - logger.info(f" -> Param {param} ({dtype}) set = *****") + logger.info(f" -> Set {param} ({dtype}) = *****") else: - logger.info(f" -> Param {param} ({dtype}) set = {d_params[param]}") + logger.info(f" -> Set {param} ({dtype}) = {d_params[param]}") return d_params From af5a5ff098bc08f37bfeecab851cb4ac12d6cadf Mon Sep 17 00:00:00 2001 From: Aukerman Date: Wed, 6 Apr 2022 17:07:49 -0400 Subject: [PATCH 13/18] Better detect url scheme --- pyluna-common/luna/common/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyluna-common/luna/common/utils.py b/pyluna-common/luna/common/utils.py index 3cee58eb..887268af 100644 --- a/pyluna-common/luna/common/utils.py +++ b/pyluna-common/luna/common/utils.py @@ -389,7 +389,9 @@ def post_to_dataset(input_feature_data, waystation_url, dataset_id, keys): post_url = os.path.join ( waystation_url, "datasets", dataset_id, "segments", segment_id ) - if 'http' in post_url: + parsed_url = urllib.parse.urlparse(post_url) + + if 'http' in parsed_url.scheme: # The cool way, using luna waystation logger.info (f"Posting to: {post_url}") @@ -398,10 +400,10 @@ def post_to_dataset(input_feature_data, waystation_url, dataset_id, keys): logger.info (f"{res}: {res.text}") - elif 'file:/' in post_url: + elif 'file' in parsed_url.scheme: # The less cool way, just using file paths - segment_dir = Path ( urllib.parse.urlparse(post_url).path ) + segment_dir = Path ( parsed_url.path ) logger.info (f"Writing to: {segment_dir}") From 2c7fe6bbe38d54dfecaaf227cb8c47100925dfc7 Mon Sep 17 00:00:00 2001 From: Aukerman Date: Wed, 6 Apr 2022 17:09:43 -0400 Subject: [PATCH 14/18] Docs post_to_dataset --- pyluna-common/luna/common/utils.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pyluna-common/luna/common/utils.py b/pyluna-common/luna/common/utils.py index 887268af..eae605be 100644 --- a/pyluna-common/luna/common/utils.py +++ b/pyluna-common/luna/common/utils.py @@ -369,14 +369,13 @@ def get_dataset_url(): def post_to_dataset(input_feature_data, waystation_url, dataset_id, keys): - """ CLI tool method + """ Interface feature data to a parquet dataset Args: - input_data (str): path to input data - output_dir (str): output/working directory - - Returns: - dict: metadata about function call + input_feature_data (str): path to input data + waystation_url (str): URL of dataset root (either file or using waystation) + dataset_id (str): Dataset name/ID + keys (dict): corresponding segment keys """ logger.info(f"Adding {input_feature_data} to {dataset_id} via {waystation_url}") @@ -410,6 +409,9 @@ def post_to_dataset(input_feature_data, waystation_url, dataset_id, keys): os.makedirs(segment_dir, exist_ok=True) shutil.copy(input_feature_data, segment_dir.joinpath("data.parquet")) + + else: + logger.warning("Unrecognized scheme: {parsed_url.scheme}, skipping!") From 25622058dfb6be34fe0eba4b4f670f0d024ee58b Mon Sep 17 00:00:00 2001 From: Aukerman Date: Mon, 11 Apr 2022 10:05:29 -0400 Subject: [PATCH 15/18] documentation --- pyluna-common/luna/common/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyluna-common/luna/common/utils.py b/pyluna-common/luna/common/utils.py index eae605be..98f94123 100644 --- a/pyluna-common/luna/common/utils.py +++ b/pyluna-common/luna/common/utils.py @@ -357,6 +357,7 @@ def expand_inputs(given_params: dict): def get_dataset_url(): + """ Retrieve a "dataset URL" from the environment, may look like http://localhost:6077 or file:///absolute/path/to/dataset/dir """ dataset_url = os.environ.get("DATASET_URL", None) if dataset_url is None: @@ -424,6 +425,7 @@ def cli_runner( cli_kwargs (dict): keyword arguments from the CLI call cli_params (List[tuple]): param list, where each element is the parameter (name, type) cli_function (Callable[..., dict]): cli_function entry point, should accept exactly the arguments given by cli_params + pass_keys (bool): will pass found segment keys to transform function as 'keys' kwarg Returns: None From fcdaf2b50c544e6f759cf8012cc35d48a4c6b574 Mon Sep 17 00:00:00 2001 From: Aukerman Date: Mon, 11 Apr 2022 11:22:46 -0400 Subject: [PATCH 16/18] Last few changes --- .../luna/pathology/analysis/ml.py | 11 +++++---- .../luna/pathology/cli/infer_tile_labels.py | 24 +++++++++++++------ .../cli/visualize_tile_labels_png.py | 2 +- .../luna/pathology/common/utils.py | 2 +- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/pyluna-pathology/luna/pathology/analysis/ml.py b/pyluna-pathology/luna/pathology/analysis/ml.py index 4e7f320c..20e761be 100644 --- a/pyluna-pathology/luna/pathology/analysis/ml.py +++ b/pyluna-pathology/luna/pathology/analysis/ml.py @@ -104,17 +104,20 @@ def __getitem__(self, idx: int): return self.preprocess(img), row.name -def post_transform_to_2d(input: torch.Tensor) -> np.array: +def post_transform_to_2d(input: np.array) -> np.array: """Convert input to a 2D numpy array on CPU Args: input (torch.tensor): tensor input of shape [B, *] where B is the batch dimension """ + if type (input)== torch.tensor: + input = input.cpu.numpy() + if not len(input.shape) == 2: warnings.warn(f"Reshaping model output (was {input.shape}) to 2D") - return input.view(input.shape[0], -1).cpu().numpy() - else: - return input.cpu().numpy() + input = np.reshape(input, (input.shape[0], -1)) + + return input class BaseTorchTileDataset(Dataset): diff --git a/pyluna-pathology/luna/pathology/cli/infer_tile_labels.py b/pyluna-pathology/luna/pathology/cli/infer_tile_labels.py index 46e08ee9..881a053e 100644 --- a/pyluna-pathology/luna/pathology/cli/infer_tile_labels.py +++ b/pyluna-pathology/luna/pathology/cli/infer_tile_labels.py @@ -46,9 +46,11 @@ help="repository name to pull model and weight from, e.g. msk-mind/luna-ml", ) @click.option( - "-tn", "--transform_name", required=False, help="torch hub transform name" + "-mn", + "--model_name", + required=False, + help="torch hub model name", ) -@click.option("-mn", "--model_name", required=False, help="torch hub model name") @click.option( "-kw", "--kwargs", @@ -72,6 +74,12 @@ required=False, help="path to a metadata json/yaml file with method parameters to reproduce results", ) +@click.option( + "-dsid", + "--dataset_id", + required=False, + help="Optional dataset identifier to add results to", +) def cli(**cli_kwargs): """Run a model with a specific pre-transform for all tiles in a slide (tile_images), requires tiles to be saved (save_tiles) first @@ -110,7 +118,6 @@ def infer_tile_labels( input_slide_tiles (str): path to a slide-tile manifest file (.tiles.csv) output_dir (str): output/working directory repo_name (str): repository root name like (namespace/repo) at github.com to serve torch.hub models - transform_name (str): torch hub transform name (a function at the repo repo_name) model_name (str): torch hub model name (a nn.Module at the repo repo_name) weight_tag (str): what weight tag to use num_cores (int): Number of cores to use for CPU parallelization @@ -128,10 +135,12 @@ def infer_tile_labels( logger.info(f"Torch hub source = {source} @ {hub_repo_or_dir}") - ttm = torch.hub.load(hub_repo_or_dir, model_name, source=source, **kwargs) + logger.info(f"Available models: {torch.hub.list(hub_repo_or_dir)}") + + ttm = torch.hub.load(hub_repo_or_dir, model_name, source=source, **kwargs, force_reload=True) if not isinstance(ttm, TorchTransformModel): - raise RuntimeError("Not a valid model!") + raise RuntimeError(f"Not a valid model, loaded model was of type {type(ttm)}") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") logger.info(f"Using device = {device}") @@ -161,8 +170,8 @@ def infer_tile_labels( ) if hasattr(ttm, "class_labels"): - logger.info(f"Mapping column labels -> {ttm.class_labels}") - df_scores = df_scores.rename(columns=ttm.class_labels) + logger.info(f"Mapping column labels -> {ttm.column_labels}") + df_scores = df_scores.rename(columns=ttm.column_labels) df_output = df.join(df_scores) df_output.columns = df_output.columns.astype(str) @@ -177,6 +186,7 @@ def infer_tile_labels( # Save our properties and params properties = { "slide_tiles": output_file, + "feature_data": output_file, "total_tiles": len(df_output), "available_labels": list(df_output.columns), } diff --git a/pyluna-pathology/luna/pathology/cli/visualize_tile_labels_png.py b/pyluna-pathology/luna/pathology/cli/visualize_tile_labels_png.py index f1e32a25..6ec89f12 100644 --- a/pyluna-pathology/luna/pathology/cli/visualize_tile_labels_png.py +++ b/pyluna-pathology/luna/pathology/cli/visualize_tile_labels_png.py @@ -81,7 +81,7 @@ def visualize_tiles(input_slide_image, input_slide_tiles, requested_magnificatio to_mag_scale_factor *= unit_sf # Get tiles - df = pd.read_csv(input_slide_tiles).set_index('address') + df = pd.read_parquet(input_slide_tiles).reset_index().set_index('address') # only visualize tile scores that were able to be computed all_score_types = set(plot_labels) diff --git a/pyluna-pathology/luna/pathology/common/utils.py b/pyluna-pathology/luna/pathology/common/utils.py index 3e7f857d..02bde573 100644 --- a/pyluna-pathology/luna/pathology/common/utils.py +++ b/pyluna-pathology/luna/pathology/common/utils.py @@ -443,7 +443,7 @@ def visualize_tiling_scores(df:pd.DataFrame, thumbnail_img:np.ndarray, scale_fac assert isinstance(thumbnail_img, np.ndarray) - if normalize: + if normalize and df[score_type_to_visualize].dtype.kind in 'biuf': df[score_type_to_visualize] = (df[score_type_to_visualize] - np.min(df[score_type_to_visualize]))/np.ptp(df[score_type_to_visualize]) for _, row in tqdm(df.iterrows(), total=len(df)): From 77f40a19c885806479dee49aaed38462b175724e Mon Sep 17 00:00:00 2001 From: Aukerman Date: Mon, 11 Apr 2022 11:38:02 -0400 Subject: [PATCH 17/18] Fix tests --- pyluna-pathology/luna/pathology/cli/infer_tile_labels.py | 5 +++-- .../luna/pathology/cli/testdata/data/testhub/hubconf.py | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pyluna-pathology/luna/pathology/cli/infer_tile_labels.py b/pyluna-pathology/luna/pathology/cli/infer_tile_labels.py index 881a053e..c1602714 100644 --- a/pyluna-pathology/luna/pathology/cli/infer_tile_labels.py +++ b/pyluna-pathology/luna/pathology/cli/infer_tile_labels.py @@ -135,7 +135,8 @@ def infer_tile_labels( logger.info(f"Torch hub source = {source} @ {hub_repo_or_dir}") - logger.info(f"Available models: {torch.hub.list(hub_repo_or_dir)}") + if source == "github": + logger.info(f"Available models: {torch.hub.list(hub_repo_or_dir)}") ttm = torch.hub.load(hub_repo_or_dir, model_name, source=source, **kwargs, force_reload=True) @@ -169,7 +170,7 @@ def infer_tile_labels( ] ) - if hasattr(ttm, "class_labels"): + if hasattr(ttm, "column_labels"): logger.info(f"Mapping column labels -> {ttm.column_labels}") df_scores = df_scores.rename(columns=ttm.column_labels) diff --git a/pyluna-pathology/tests/luna/pathology/cli/testdata/data/testhub/hubconf.py b/pyluna-pathology/tests/luna/pathology/cli/testdata/data/testhub/hubconf.py index 29025edb..ffd4e2ce 100644 --- a/pyluna-pathology/tests/luna/pathology/cli/testdata/data/testhub/hubconf.py +++ b/pyluna-pathology/tests/luna/pathology/cli/testdata/data/testhub/hubconf.py @@ -13,14 +13,14 @@ def __init__(self, n_channels): ) if n_channels == 2: - self.class_labels = {0: "Background", 1: "Tumor"} + self.column_labels = {0: "Background", 1: "Tumor"} def get_preprocess(self): return self.preprocess def transform(self, X): X = X.permute(0, 3, 1, 2).float() / 255 - out = self.model(X).view(X.shape[0], -1) + out = self.model(X).view(X.shape[0], -1).cpu().numpy() return out @@ -38,7 +38,7 @@ def get_preprocess(self): def transform(self, X): X = X.permute(0, 3, 1, 2).float() / 255 - out = self.model(X).view(X.shape[0], -1) + out = self.model(X).view(X.shape[0], -1).cpu().numpy() return out From c4c3f1a50f388b8695d1e000f84eae6b8a2ccc6e Mon Sep 17 00:00:00 2001 From: Aukerman Date: Mon, 11 Apr 2022 11:41:21 -0400 Subject: [PATCH 18/18] PNG parquet fix --- .../luna/pathology/cli/test_visualize_tile_labels_png.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyluna-pathology/tests/luna/pathology/cli/test_visualize_tile_labels_png.py b/pyluna-pathology/tests/luna/pathology/cli/test_visualize_tile_labels_png.py index b8efaea8..00fada45 100644 --- a/pyluna-pathology/tests/luna/pathology/cli/test_visualize_tile_labels_png.py +++ b/pyluna-pathology/tests/luna/pathology/cli/test_visualize_tile_labels_png.py @@ -10,12 +10,12 @@ def test_viz(tmp_path): df = pd.read_csv("pyluna-pathology/tests/luna/pathology/cli/testdata/data/generate_tiles/123/123.tiles.csv") df['random'] = np.random.rand(len(df)) - df.to_csv(f"{tmp_path}/input_tiles.csv") + df.to_parquet(f"{tmp_path}/input_tiles.parquet") runner = CliRunner() result = runner.invoke(cli, [ 'pyluna-pathology/tests/luna/pathology/cli/testdata/data/123.svs', - f"{tmp_path}/input_tiles.csv", + f"{tmp_path}/input_tiles.parquet", '-o', tmp_path, '-pl', 'random', '-rmg', 5])