diff --git a/.gitignore b/.gitignore index 0de5822..de22cfd 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ # ignore pycache __pycache__ +# ignore corrected plates if they are put into a separate folder +Corrected_Images + # ignore corrected plates images (like in download data) 1.cellprofiler_ic/Corrected_Plate_1 1.cellprofiler_ic/Corrected_Plate_2 @@ -17,6 +20,7 @@ __pycache__ # ignore log files *logs +*.logs # ignore Mac related files .DS_Store diff --git a/1.cellprofiler_ic/NF1_illum_Plates_1_2.cppipe b/1.cellprofiler_ic/NF1_illum_3channel.cppipe similarity index 88% rename from 1.cellprofiler_ic/NF1_illum_Plates_1_2.cppipe rename to 1.cellprofiler_ic/NF1_illum_3channel.cppipe index 81f7a3d..ff2038e 100644 --- a/1.cellprofiler_ic/NF1_illum_Plates_1_2.cppipe +++ b/1.cellprofiler_ic/NF1_illum_3channel.cppipe @@ -5,12 +5,12 @@ GitHash: ModuleCount:11 HasImagePlaneDetails:False -Images:[module_num:1|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['To begin creating your project, use the Images module to compile a list of files and/or folders that you want to analyze. You can also specify a set of rules to include only the desired files in your selected folders.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +Images:[module_num:1|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['Images module is blank since we are adding the path to the images in the CLI']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] : Filter images?:Images only Select the rule criteria:and (extension does isimage) (directory doesnot containregexp "[\\\\/]\\.") -Metadata:[module_num:2|svn_version:'Unknown'|variable_revision_number:6|show_window:False|notes:['The Metadata module optionally allows you to extract information describing your images (i.e, metadata) which will be stored along with your measurements. 
This information can be contained in the file name and/or location, or in an external file.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +Metadata:[module_num:2|svn_version:'Unknown'|variable_revision_number:6|show_window:False|notes:['No metadata is extracted during illumination correction']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Extract metadata?:No Metadata data type:Text Metadata types:{} @@ -27,7 +27,7 @@ Metadata:[module_num:2|svn_version:'Unknown'|variable_revision_number:6|show_win Metadata file name:None Does cached metadata exist?:No -NamesAndTypes:[module_num:3|svn_version:'Unknown'|variable_revision_number:8|show_window:False|notes:['The NamesAndTypes module allows you to assign a meaningful name to each image by which other modules will refer to it.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +NamesAndTypes:[module_num:3|svn_version:'Unknown'|variable_revision_number:8|show_window:False|notes:['In this pipeline, we only have three channels to correct:', '', 'DAPI', 'GFP', 'RFP']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Assign a name to:Images matching rules Select the image type:Grayscale image Name to assign these images:DNA @@ -60,7 +60,7 @@ NamesAndTypes:[module_num:3|svn_version:'Unknown'|variable_revision_number:8|sho Set intensity range from:Image metadata Maximum intensity:255.0 -Groups:[module_num:4|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['The Groups module optionally allows you to split your list of images into image subsets (groups) which will be processed independently of each other. 
Examples of groupings include screening batches, microtiter plates, time-lapse movies, etc.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +Groups:[module_num:4|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['We do not use the Groups module']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Do you want to group your images?:No grouping metadata count:1 Metadata category:None @@ -156,7 +156,7 @@ CorrectIlluminationApply:[module_num:8|svn_version:'Unknown'|variable_revision_n Set output image values less than 0 equal to 0?:Yes Set output image values greater than 1 equal to 1?:Yes -SaveImages:[module_num:9|svn_version:'Unknown'|variable_revision_number:16|show_window:False|notes:['Save DAPI images as same tiff bit depth']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +SaveImages:[module_num:9|svn_version:'Unknown'|variable_revision_number:16|show_window:False|notes:['Save DAPI images as same bit depth']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Select the type of image to save:Image Select the image to save:CorrDAPI Select method for constructing file names:From image filename diff --git a/1.cellprofiler_ic/NF1_illum_Plate3_Plate3prime.cppipe b/1.cellprofiler_ic/NF1_illum_4channel.cppipe similarity index 90% rename from 1.cellprofiler_ic/NF1_illum_Plate3_Plate3prime.cppipe rename to 1.cellprofiler_ic/NF1_illum_4channel.cppipe index 4b46540..4b3cc6e 100644 --- a/1.cellprofiler_ic/NF1_illum_Plate3_Plate3prime.cppipe +++ b/1.cellprofiler_ic/NF1_illum_4channel.cppipe @@ -5,12 +5,12 @@ GitHash: ModuleCount:13 HasImagePlaneDetails:False -Images:[module_num:1|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['To begin creating your project, use the Images module to compile a list of files and/or folders that you want to analyze. 
You can also specify a set of rules to include only the desired files in your selected folders.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +Images:[module_num:1|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['Images module is blank in the GUI since we are adding the image path in CLI.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] : Filter images?:Images only Select the rule criteria:and (extension does isimage) (directory doesnot containregexp "[\\\\/]\\.") -Metadata:[module_num:2|svn_version:'Unknown'|variable_revision_number:6|show_window:False|notes:['The Metadata module optionally allows you to extract information describing your images (i.e, metadata) which will be stored along with your measurements. This information can be contained in the file name and/or location, or in an external file.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +Metadata:[module_num:2|svn_version:'Unknown'|variable_revision_number:6|show_window:False|notes:['We do not extract metadata when performing illumination correction']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Extract metadata?:No Metadata data type:Text Metadata types:{} @@ -27,7 +27,7 @@ Metadata:[module_num:2|svn_version:'Unknown'|variable_revision_number:6|show_win Metadata file name:None Does cached metadata exist?:No -NamesAndTypes:[module_num:3|svn_version:'Unknown'|variable_revision_number:8|show_window:False|notes:['The NamesAndTypes module allows you to assign a meaningful name to each image by which other modules will refer to it.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +NamesAndTypes:[module_num:3|svn_version:'Unknown'|variable_revision_number:8|show_window:False|notes:['This pipeline is meant for correction with 4 channels specifically:', '', 'DAPI', 'GFP', 'RFP', 'CY5', '']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Assign a name to:Images 
matching rules Select the image type:Grayscale image Name to assign these images:DNA @@ -66,7 +66,7 @@ NamesAndTypes:[module_num:3|svn_version:'Unknown'|variable_revision_number:8|sho Set intensity range from:Image metadata Maximum intensity:255.0 -Groups:[module_num:4|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['The Groups module optionally allows you to split your list of images into image subsets (groups) which will be processed independently of each other. Examples of groupings include screening batches, microtiter plates, time-lapse movies, etc.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +Groups:[module_num:4|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['We do not use the Groups module.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Do you want to group your images?:No grouping metadata count:1 Metadata category:None @@ -171,7 +171,7 @@ CorrectIlluminationCalculate:[module_num:8|svn_version:'Unknown'|variable_revisi Maximum number of iterations:40 Residual value for convergence:0.001 -CorrectIlluminationApply:[module_num:9|svn_version:'Unknown'|variable_revision_number:5|show_window:False|notes:['Apply illumination correction functions to all channel groups']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +CorrectIlluminationApply:[module_num:9|svn_version:'Unknown'|variable_revision_number:5|show_window:False|notes:['Apply illumination correction functions to all channels']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Select the input image:OrigDAPI Name the output image:CorrDAPI Select the illumination function:IllumDAPI @@ -251,7 +251,7 @@ SaveImages:[module_num:12|svn_version:'Unknown'|variable_revision_number:16|show How to save the series:T (Time) Save with lossless compression?:Yes -SaveImages:[module_num:13|svn_version:'Unknown'|variable_revision_number:16|show_window:False|notes:['Save CY5 images as the 
same bit-deph']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +SaveImages:[module_num:13|svn_version:'Unknown'|variable_revision_number:16|show_window:False|notes:['Save CY5 images as the same bit-depth']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Select the type of image to save:Image Select the image to save:CorrMito Select method for constructing file names:From image filename diff --git a/1.cellprofiler_ic/README.md b/1.cellprofiler_ic/README.md index 7ae411f..709f250 100644 --- a/1.cellprofiler_ic/README.md +++ b/1.cellprofiler_ic/README.md @@ -3,6 +3,16 @@ In this module, we perform illumination correction (IC) on images for each plate and save the corrected images into new folders. Images are saved as 16-bit depth, which is the same as the raw data. +## CellProfiler Pipeline + +Because the number of channels differs between plates, we have two different illumination correction pipelines. + +1. [3 channel pipeline](./NF1_illum_3channel.cppipe) -> This pipeline is used only with Plates 1 and 2 since these first pilot plates only had 3 channels. +2. [4 channel pipeline](./NF1_illum_4channel.cppipe) -> This pipeline is used for all the rest of the plates (and future plates) as we have established the protocol for staining. + +**Note:** The correction parameters for the same channel might differ slightly between the two pipelines. +However, the parameters in each pipeline produce the best corrected images at this point. + ## Run the `nf1_ic` notebook To calculate and apply an IC function on each channel, run the [nf1_ic.ipynb](nf1_ic.ipynb) notebook as a python script using the code block below: @@ -15,6 +25,20 @@ cd 1.cellprofiler_ic source nf1_ic.sh ``` +## CellProfiler Parallel + +To improve the speed for correcting the images, we have implemented `CellProfiler Parallel`, which utilizes multi-processing to run one plate per CPU core. 
+ +### Speed when running CellProfiler Parallel + +To run **five plates** through illumination correction, it took approximately **1 hour** in total on a computer using Pop_OS! 22.04 LTS with an AMD Ryzen 7 3700X 8-Core Processor. + +This means that we are saving 2 hours (assuming the fifth plate running sequentially would take another hour totalling 3 hours) when running five plates. + +### Speed when running CellProfiler sequentially + +In the past, we ran one command per plate in sequential order (e.g., one plate is run and once it finishes the next plate starts). + For four plates, it took about 2 hours to calculate, apply, and save illumination corrected images on a computer using Pop_OS! 22.04 LTS with an AMD Ryzen 7 3700X 8-Core Processor. For three plates, it took about 18 hours to calculate and save illumination corrected images on a computer using MacOS Ventura 13.2.1 with the M2 chip. diff --git a/1.cellprofiler_ic/nf1_ic.ipynb b/1.cellprofiler_ic/nf1_ic.ipynb index 64285fc..5bf0b42 100644 --- a/1.cellprofiler_ic/nf1_ic.ipynb +++ b/1.cellprofiler_ic/nf1_ic.ipynb @@ -5,7 +5,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Correct illumination and save images for each plate using CellProfiler" + "# Correct illumination and save images for each plate using CellProfiler Parallel" ] }, { @@ -22,11 +22,13 @@ "metadata": {}, "outputs": [], "source": [ - "import sys\n", "import pathlib\n", + "import pprint\n", + "\n", + "import sys\n", "\n", - "sys.path.append(\"../\")\n", - "from utils import cp_utils" + "sys.path.append(\"../utils\")\n", + "import cp_parallel" ] }, { @@ -34,9 +36,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Set paths for each plate\n", - "\n", - "Note: Output file path does not need to be strict since the `run_cellprofiler` function can create the output folder directory if it doesn't already exist. The other paths must be strict since these files should already exist for CellProfiler to run. 
The output directory doesn't need to already exist." + "## Set paths and variables" ] }, { @@ -45,48 +45,88 @@ "metadata": {}, "outputs": [], "source": [ - "plates_info_dictionary = {\n", - " \"Plate_1\": {\n", - " # this pipeline is specific to plates 1 and 2 (due to channel difference from plates 3 and 3 prime)\n", - " \"path_to_pipeline\": pathlib.Path(\"NF1_illum_Plates_1_2.cppipe\").resolve(\n", - " strict=True\n", - " ),\n", - " \"path_to_images\": pathlib.Path(\"../0.download_data/Plate_1/\").resolve(\n", - " strict=True\n", - " ),\n", - " \"path_to_output\": pathlib.Path(\"Corrected_Plate_1\").resolve(),\n", - " },\n", - " \"Plate_2\": {\n", - " # this pipeline is specific to plates 1 and 2 (due to channel difference from plates 3 and 3 prime)\n", - " \"path_to_pipeline\": pathlib.Path(\"NF1_illum_Plates_1_2.cppipe\").resolve(\n", - " strict=True\n", - " ),\n", - " \"path_to_images\": pathlib.Path(\"../0.download_data/Plate_2/\").resolve(\n", - " strict=True\n", - " ),\n", - " \"path_to_output\": pathlib.Path(\"Corrected_Plate_2\").resolve(),\n", - " },\n", - " \"Plate_3\": {\n", - " # this pipeline is specific to plates 3 and 3 prime (due to channel difference from plates 1 and 2)\n", - " \"path_to_pipeline\": pathlib.Path(\"NF1_illum_Plate3_Plate3prime.cppipe\").resolve(\n", - " strict=True\n", - " ),\n", - " \"path_to_images\": pathlib.Path(\"../0.download_data/Plate_3/\").resolve(\n", + "# set the run type for the parallelization\n", + "run_name = \"illum_correction\"\n", + "\n", + "# set path for pipeline for all plates for whole image analysis\n", + "path_to_pipeline = pathlib.Path(\"./pipelines/nuclei_analysis.cppipe\").resolve()\n", + "\n", + "# set main output dir for all plates\n", + "output_dir = pathlib.Path(\"./Corrected_Images\")\n", + "output_dir.mkdir(exist_ok=True)\n", + "\n", + "# directory where images are located within folders\n", + "images_dir = pathlib.Path(\"../0.download_data/\")\n", + "\n", + "# list for plate names based on 
folders to use to create dictionary\n", + "plate_names = []\n", + "# iterate through 0.download_data and append plate names from folder names that contain image data from that plate\n", + "for file_path in pathlib.Path(\"../0.download_data/\").iterdir():\n", + " if str(file_path.stem).startswith(\"Plate\"):\n", + " plate_names.append(str(file_path.stem))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create dictionary with all info for each plate" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{ 'Plate_1': { 'path_to_images': PosixPath('/home/jenna/nf1_cellpainting_data/0.download_data/Plate_1'),\n", + " 'path_to_output': PosixPath('Corrected_Images/Corrected_Plate_1'),\n", + " 'path_to_pipeline': PosixPath('/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/NF1_illum_3channel.cppipe')},\n", + " 'Plate_2': { 'path_to_images': PosixPath('/home/jenna/nf1_cellpainting_data/0.download_data/Plate_2'),\n", + " 'path_to_output': PosixPath('Corrected_Images/Corrected_Plate_2'),\n", + " 'path_to_pipeline': PosixPath('/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/NF1_illum_3channel.cppipe')},\n", + " 'Plate_3': { 'path_to_images': PosixPath('/home/jenna/nf1_cellpainting_data/0.download_data/Plate_3'),\n", + " 'path_to_output': PosixPath('Corrected_Images/Corrected_Plate_3'),\n", + " 'path_to_pipeline': PosixPath('/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/NF1_illum_4channel.cppipe')},\n", + " 'Plate_3_prime': { 'path_to_images': PosixPath('/home/jenna/nf1_cellpainting_data/0.download_data/Plate_3_prime'),\n", + " 'path_to_output': PosixPath('Corrected_Images/Corrected_Plate_3_prime'),\n", + " 'path_to_pipeline': PosixPath('/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/NF1_illum_4channel.cppipe')},\n", + " 'Plate_4': { 'path_to_images': PosixPath('/home/jenna/nf1_cellpainting_data/0.download_data/Plate_4'),\n", + 
" 'path_to_output': PosixPath('Corrected_Images/Corrected_Plate_4'),\n", + " 'path_to_pipeline': PosixPath('/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/NF1_illum_4channel.cppipe')}}\n" + ] + } + ], + "source": [ + "# create plate info dictionary with all parts of the CellProfiler CLI command to run in parallel\n", + "plate_info_dictionary = {\n", + " name: {\n", + " \"path_to_images\": pathlib.Path(list(images_dir.rglob(name))[0]).resolve(\n", " strict=True\n", " ),\n", - " \"path_to_output\": pathlib.Path(\"Corrected_Plate_3\").resolve(),\n", - " },\n", - " \"Plate_3_prime\": {\n", - " # this pipeline is specific to plates 3 and 3 prime (due to channel difference from plates 1 and 2)\n", - " \"path_to_pipeline\": pathlib.Path(\"NF1_illum_Plate3_Plate3prime.cppipe\").resolve(\n", + " \"path_to_output\": pathlib.Path(f\"{output_dir}/Corrected_{name}\"),\n", + " }\n", + " for name in plate_names\n", + "}\n", + "\n", + "# iterate over the dictionary and add the path_to_pipeline specific for each plate\n", + "for name, info in plate_info_dictionary.items():\n", + " # only plates 1 and 2 have 3 channels so these are the only plates that use this path\n", + " if name == \"Plate_1\" or name == \"Plate_2\":\n", + " info[\"path_to_pipeline\"] = pathlib.Path(f\"./NF1_illum_3channel.cppipe\").resolve(\n", " strict=True\n", - " ),\n", - " \"path_to_images\": pathlib.Path(\"../0.download_data/Plate_3_prime/\").resolve(\n", + " )\n", + " # all other plates have 4 channels and will use that specific pipeline\n", + " else:\n", + " info[\"path_to_pipeline\"] = pathlib.Path(f\"./NF1_illum_4channel.cppipe\").resolve(\n", " strict=True\n", - " ),\n", - " \"path_to_output\": pathlib.Path(\"Corrected_Plate_3_prime\").resolve(),\n", - " },\n", - "}" + " )\n", + "\n", + "# view the dictionary to assess that all info is added correctly\n", + "pprint.pprint(plate_info_dictionary, indent=4)" ] }, { @@ -94,30 +134,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Run 
illumination correction pipeline on each plate\n", + "## Run illumination correction pipeline on each plate in parallel\n", "\n", "In this notebook, we do not run the cells to completion as we prefer to run the notebooks as nbconverted python files due to better stability." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Correcting Plate_1\n", - "Starting CellProfiler run on Plate_1\n", - "The CellProfiler run has been completed with Plate_1. Please check log file for any errors.\n", - "Correcting Plate_2\n", - "Starting CellProfiler run on Plate_2\n", - "The CellProfiler run has been completed with Plate_2. Please check log file for any errors.\n", - "Correcting Plate_3\n", - "Starting CellProfiler run on Plate_3\n" - ] - }, { "ename": "KeyboardInterrupt", "evalue": "", @@ -125,36 +151,19 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[3], line 9\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mCorrecting \u001b[39m\u001b[39m{\u001b[39;00mplate\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 8\u001b[0m \u001b[39m# run illumination correction pipeline and save images\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m cp_utils\u001b[39m.\u001b[39;49mrun_cellprofiler(\n\u001b[1;32m 10\u001b[0m path_to_pipeline\u001b[39m=\u001b[39;49mpath_to_pipeline,\n\u001b[1;32m 11\u001b[0m path_to_output\u001b[39m=\u001b[39;49mpath_to_output,\n\u001b[1;32m 12\u001b[0m path_to_images\u001b[39m=\u001b[39;49mpath_to_images,\n\u001b[1;32m 13\u001b[0m \u001b[39m# these variables are turned off for illum pipeline\u001b[39;49;00m\n\u001b[1;32m 14\u001b[0m 
sqlite_name\u001b[39m=\u001b[39;49m\u001b[39mNone\u001b[39;49;00m,\n\u001b[1;32m 15\u001b[0m analysis_run\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m 16\u001b[0m )\n", - "File \u001b[0;32m~/Desktop/GitHub_Repos/nf1_cellpainting_data/1.cellprofiler_ic/../utils/cp_utils.py:98\u001b[0m, in \u001b[0;36mrun_cellprofiler\u001b[0;34m(path_to_pipeline, path_to_output, path_to_images, sqlite_name, analysis_run)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(\n\u001b[1;32m 81\u001b[0m pathlib\u001b[39m.\u001b[39mPath(\n\u001b[1;32m 82\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mlogs/cellprofiler_output_\u001b[39m\u001b[39m{\u001b[39;00mpathlib\u001b[39m.\u001b[39mPath(path_to_images)\u001b[39m.\u001b[39mname\u001b[39m}\u001b[39;00m\u001b[39m.log\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 85\u001b[0m ) \u001b[39mas\u001b[39;00m cellprofiler_output_file:\n\u001b[1;32m 86\u001b[0m \u001b[39m# run CellProfiler for a illumination correction pipeline\u001b[39;00m\n\u001b[1;32m 87\u001b[0m command \u001b[39m=\u001b[39m [\n\u001b[1;32m 88\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mcellprofiler\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 89\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m-c\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 96\u001b[0m path_to_images,\n\u001b[1;32m 97\u001b[0m ]\n\u001b[0;32m---> 98\u001b[0m subprocess\u001b[39m.\u001b[39;49mrun(\n\u001b[1;32m 99\u001b[0m command,\n\u001b[1;32m 100\u001b[0m stdout\u001b[39m=\u001b[39;49mcellprofiler_output_file,\n\u001b[1;32m 101\u001b[0m stderr\u001b[39m=\u001b[39;49mcellprofiler_output_file,\n\u001b[1;32m 102\u001b[0m check\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m,\n\u001b[1;32m 103\u001b[0m )\n\u001b[1;32m 104\u001b[0m \u001b[39mprint\u001b[39m(\n\u001b[1;32m 105\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mThe CellProfiler run has been completed with 
\u001b[39m\u001b[39m{\u001b[39;00mpathlib\u001b[39m.\u001b[39mPath(path_to_images)\u001b[39m.\u001b[39mname\u001b[39m}\u001b[39;00m\u001b[39m. Please check log file for any errors.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 106\u001b[0m )\n\u001b[1;32m 108\u001b[0m \u001b[39m# run CellProfiler analysis pipeline\u001b[39;00m\n", - "File \u001b[0;32m~/opt/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/subprocess.py:495\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m 493\u001b[0m \u001b[39mwith\u001b[39;00m Popen(\u001b[39m*\u001b[39mpopenargs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs) \u001b[39mas\u001b[39;00m process:\n\u001b[1;32m 494\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 495\u001b[0m stdout, stderr \u001b[39m=\u001b[39m process\u001b[39m.\u001b[39;49mcommunicate(\u001b[39minput\u001b[39;49m, timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[1;32m 496\u001b[0m \u001b[39mexcept\u001b[39;00m TimeoutExpired \u001b[39mas\u001b[39;00m exc:\n\u001b[1;32m 497\u001b[0m process\u001b[39m.\u001b[39mkill()\n", - "File \u001b[0;32m~/opt/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/subprocess.py:1020\u001b[0m, in \u001b[0;36mPopen.communicate\u001b[0;34m(self, input, timeout)\u001b[0m\n\u001b[1;32m 1018\u001b[0m stderr \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstderr\u001b[39m.\u001b[39mread()\n\u001b[1;32m 1019\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstderr\u001b[39m.\u001b[39mclose()\n\u001b[0;32m-> 1020\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mwait()\n\u001b[1;32m 1021\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1022\u001b[0m \u001b[39mif\u001b[39;00m timeout \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/opt/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/subprocess.py:1083\u001b[0m, in \u001b[0;36mPopen.wait\u001b[0;34m(self, 
timeout)\u001b[0m\n\u001b[1;32m 1081\u001b[0m endtime \u001b[39m=\u001b[39m _time() \u001b[39m+\u001b[39m timeout\n\u001b[1;32m 1082\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 1083\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_wait(timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[1;32m 1084\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m 1085\u001b[0m \u001b[39m# https://bugs.python.org/issue25942\u001b[39;00m\n\u001b[1;32m 1086\u001b[0m \u001b[39m# The first keyboard interrupt waits briefly for the child to\u001b[39;00m\n\u001b[1;32m 1087\u001b[0m \u001b[39m# exit under the common assumption that it also received the ^C\u001b[39;00m\n\u001b[1;32m 1088\u001b[0m \u001b[39m# generated SIGINT and will exit rapidly.\u001b[39;00m\n\u001b[1;32m 1089\u001b[0m \u001b[39mif\u001b[39;00m timeout \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/opt/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/subprocess.py:1806\u001b[0m, in \u001b[0;36mPopen._wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1804\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mreturncode \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 1805\u001b[0m \u001b[39mbreak\u001b[39;00m \u001b[39m# Another thread waited.\u001b[39;00m\n\u001b[0;32m-> 1806\u001b[0m (pid, sts) \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_try_wait(\u001b[39m0\u001b[39;49m)\n\u001b[1;32m 1807\u001b[0m \u001b[39m# Check the pid and loop as waitpid has been known to\u001b[39;00m\n\u001b[1;32m 1808\u001b[0m \u001b[39m# return 0 even without WNOHANG in odd situations.\u001b[39;00m\n\u001b[1;32m 1809\u001b[0m \u001b[39m# http://bugs.python.org/issue14396.\u001b[39;00m\n\u001b[1;32m 1810\u001b[0m \u001b[39mif\u001b[39;00m pid \u001b[39m==\u001b[39m 
\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpid:\n", - "File \u001b[0;32m~/opt/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/subprocess.py:1764\u001b[0m, in \u001b[0;36mPopen._try_wait\u001b[0;34m(self, wait_flags)\u001b[0m\n\u001b[1;32m 1762\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"All callers to this function MUST hold self._waitpid_lock.\"\"\"\u001b[39;00m\n\u001b[1;32m 1763\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 1764\u001b[0m (pid, sts) \u001b[39m=\u001b[39m os\u001b[39m.\u001b[39;49mwaitpid(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpid, wait_flags)\n\u001b[1;32m 1765\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mChildProcessError\u001b[39;00m:\n\u001b[1;32m 1766\u001b[0m \u001b[39m# This happens if SIGCLD is set to be ignored or waiting\u001b[39;00m\n\u001b[1;32m 1767\u001b[0m \u001b[39m# for child processes has otherwise been disabled for our\u001b[39;00m\n\u001b[1;32m 1768\u001b[0m \u001b[39m# process. This child is dead, we can't get the status.\u001b[39;00m\n\u001b[1;32m 1769\u001b[0m pid \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpid\n", + "Cell \u001b[0;32mIn[4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m cp_parallel\u001b[39m.\u001b[39;49mrun_cellprofiler_parallel(\n\u001b[1;32m 2\u001b[0m plate_info_dictionary\u001b[39m=\u001b[39;49mplate_info_dictionary, run_name\u001b[39m=\u001b[39;49mrun_name\n\u001b[1;32m 3\u001b[0m )\n", + "File \u001b[0;32m~/nf1_cellpainting_data/1.cellprofiler_ic/../utils/cp_parallel.py:119\u001b[0m, in \u001b[0;36mrun_cellprofiler_parallel\u001b[0;34m(plate_info_dictionary, run_name)\u001b[0m\n\u001b[1;32m 109\u001b[0m futures: List[Future] \u001b[39m=\u001b[39m [\n\u001b[1;32m 110\u001b[0m executor\u001b[39m.\u001b[39msubmit(\n\u001b[1;32m 111\u001b[0m subprocess\u001b[39m.\u001b[39mrun,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[39mfor\u001b[39;00m command \u001b[39min\u001b[39;00m commands\n\u001b[1;32m 116\u001b[0m ]\n\u001b[1;32m 
118\u001b[0m \u001b[39m# the list of CompletedProcesses holds all the information from the CellProfiler run\u001b[39;00m\n\u001b[0;32m--> 119\u001b[0m results: List[subprocess\u001b[39m.\u001b[39mCompletedProcess] \u001b[39m=\u001b[39m [future\u001b[39m.\u001b[39mresult() \u001b[39mfor\u001b[39;00m future \u001b[39min\u001b[39;00m futures]\n\u001b[1;32m 121\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mAll processes have been completed!\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 123\u001b[0m \u001b[39m# for each process, confirm that the process completed succesfully and return a log file\u001b[39;00m\n", + "File \u001b[0;32m~/nf1_cellpainting_data/1.cellprofiler_ic/../utils/cp_parallel.py:119\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 109\u001b[0m futures: List[Future] \u001b[39m=\u001b[39m [\n\u001b[1;32m 110\u001b[0m executor\u001b[39m.\u001b[39msubmit(\n\u001b[1;32m 111\u001b[0m subprocess\u001b[39m.\u001b[39mrun,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[39mfor\u001b[39;00m command \u001b[39min\u001b[39;00m commands\n\u001b[1;32m 116\u001b[0m ]\n\u001b[1;32m 118\u001b[0m \u001b[39m# the list of CompletedProcesses holds all the information from the CellProfiler run\u001b[39;00m\n\u001b[0;32m--> 119\u001b[0m results: List[subprocess\u001b[39m.\u001b[39mCompletedProcess] \u001b[39m=\u001b[39m [future\u001b[39m.\u001b[39;49mresult() \u001b[39mfor\u001b[39;00m future \u001b[39min\u001b[39;00m futures]\n\u001b[1;32m 121\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mAll processes have been completed!\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 123\u001b[0m \u001b[39m# for each process, confirm that the process completed succesfully and return a log file\u001b[39;00m\n", + "File \u001b[0;32m~/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/concurrent/futures/_base.py:439\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 436\u001b[0m 
\u001b[39melif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_state \u001b[39m==\u001b[39m FINISHED:\n\u001b[1;32m 437\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m__get_result()\n\u001b[0;32m--> 439\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_condition\u001b[39m.\u001b[39;49mwait(timeout)\n\u001b[1;32m 441\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_state \u001b[39min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n\u001b[1;32m 442\u001b[0m \u001b[39mraise\u001b[39;00m CancelledError()\n", + "File \u001b[0;32m~/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/threading.py:302\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 300\u001b[0m \u001b[39mtry\u001b[39;00m: \u001b[39m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m 301\u001b[0m \u001b[39mif\u001b[39;00m timeout \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 302\u001b[0m waiter\u001b[39m.\u001b[39;49macquire()\n\u001b[1;32m 303\u001b[0m gotit \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 304\u001b[0m \u001b[39melse\u001b[39;00m:\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ - "# run through each plate with each set of paths based on dictionary\n", - "for plate, info in plates_info_dictionary.items():\n", - " path_to_pipeline = info[\"path_to_pipeline\"]\n", - " path_to_output = info[\"path_to_output\"]\n", - " # make output dir if needed\n", - " path_to_output.mkdir(exist_ok=True)\n", - " path_to_images = info[\"path_to_images\"]\n", - " print(f\"Correcting {plate}\")\n", - "\n", - " # run illumination correction pipeline and save images\n", - " cp_utils.run_cellprofiler(\n", - " path_to_pipeline=path_to_pipeline,\n", - " path_to_output=path_to_output,\n", - " path_to_images=path_to_images,\n", - " # these variables are turned off for illum pipeline\n", - " 
sqlite_name=None,\n", - " analysis_run=False,\n", - " )" + "cp_parallel.run_cellprofiler_parallel(\n", + " plate_info_dictionary=plate_info_dictionary, run_name=run_name\n", + ")" ] } ], diff --git a/1.cellprofiler_ic/nf1_ic.sh b/1.cellprofiler_ic/nf1_ic.sh index 1161272..6119364 100644 --- a/1.cellprofiler_ic/nf1_ic.sh +++ b/1.cellprofiler_ic/nf1_ic.sh @@ -1,8 +1,8 @@ #!/bin/bash -# this line is needed for the sh file to properly activate a conda environment -eval "$(conda shell.bash hook)" -# activate the main conda environment (if not already activated) +# initialize the correct shell for your machine to allow conda to work (see README for note on shell names) +conda init bash +# activate the main conda environment conda activate nf1_cellpainting_data # convert the notebook into a python and run the file diff --git a/1.cellprofiler_ic/scripts/nf1_ic.py b/1.cellprofiler_ic/scripts/nf1_ic.py index 236ed01..2658686 100644 --- a/1.cellprofiler_ic/scripts/nf1_ic.py +++ b/1.cellprofiler_ic/scripts/nf1_ic.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # coding: utf-8 -# # Correct illumination and save images for each plate using CellProfiler +# # Correct illumination and save images for each plate using CellProfiler Parallel # ## Import libraries @@ -9,84 +9,80 @@ import pathlib +import pprint import sys -sys.path.append("../") -from utils import cp_utils +sys.path.append("../utils") +import cp_parallel -# ## Set paths for each plate -# -# Note: Output file path does not need to be strict since the `run_cellprofiler` function can create the output folder directory if it doesn't already exist. The other paths must be strict since these files should already exist for CellProfiler to run. The output directory doesn't need to already exist. 
+# ## Set paths and variables # In[2]: -plates_info_dictionary = { - "Plate_1": { - # this pipeline is specific to plates 1 and 2 (due to channel difference from plates 3 and 3 prime) - "path_to_pipeline": pathlib.Path("NF1_illum_Plates_1_2.cppipe").resolve( - strict=True - ), - "path_to_images": pathlib.Path("../0.download_data/Plate_1/").resolve( - strict=True - ), - "path_to_output": pathlib.Path("Corrected_Plate_1").resolve(), - }, - "Plate_2": { - # this pipeline is specific to plates 1 and 2 (due to channel difference from plates 3 and 3 prime) - "path_to_pipeline": pathlib.Path("NF1_illum_Plates_1_2.cppipe").resolve( - strict=True - ), - "path_to_images": pathlib.Path("../0.download_data/Plate_2/").resolve( - strict=True - ), - "path_to_output": pathlib.Path("Corrected_Plate_2").resolve(), - }, - "Plate_3": { - # this pipeline is specific to plates 3 and 3 prime (due to channel difference from plates 1 and 2) - "path_to_pipeline": pathlib.Path("NF1_illum_Plate3_Plate3prime.cppipe").resolve( - strict=True - ), - "path_to_images": pathlib.Path("../0.download_data/Plate_3/").resolve( +# set the run type for the parallelization +run_name = "illum_correction" + +# set path for pipeline for all plates for whole image analysis +path_to_pipeline = pathlib.Path("./pipelines/nuclei_analysis.cppipe").resolve() + +# set main output dir for all plates +output_dir = pathlib.Path("./Corrected_Images") +output_dir.mkdir(exist_ok=True) + +# directory where images are located within folders +images_dir = pathlib.Path("../0.download_data/") + +# list for plate names based on folders to use to create dictionary +plate_names = [] +# iterate through 0.download_data and append plate names from folder names that contain image data from that plate +for file_path in pathlib.Path("../0.download_data/").iterdir(): + if str(file_path.stem).startswith("Plate"): + plate_names.append(str(file_path.stem)) + + +# ## Create dictionary with all info for each plate + +# In[3]: + + +# create 
plate info dictionary with all parts of the CellProfiler CLI command to run in parallel +plate_info_dictionary = { + name: { + "path_to_images": pathlib.Path(list(images_dir.rglob(name))[0]).resolve( strict=True ), - "path_to_output": pathlib.Path("Corrected_Plate_3").resolve(), - }, - "Plate_3_prime": { - # this pipeline is specific to plates 3 and 3 prime (due to channel difference from plates 1 and 2) - "path_to_pipeline": pathlib.Path("NF1_illum_Plate3_Plate3prime.cppipe").resolve( + "path_to_output": pathlib.Path(f"{output_dir}/Corrected_{name}"), + } + for name in plate_names +} + +# iterate over the dictionary and add the path_to_pipeline specific for each plate +for name, info in plate_info_dictionary.items(): + # only plates 1 and 2 have 3 channels so these are the only plates that use this path + if name == "Plate_1" or name == "Plate_2": + info["path_to_pipeline"] = pathlib.Path(f"./NF1_illum_3channel.cppipe").resolve( strict=True - ), - "path_to_images": pathlib.Path("../0.download_data/Plate_3_prime/").resolve( + ) + # all other plates have 4 channels and will use that specific pipeline + else: + info["path_to_pipeline"] = pathlib.Path(f"./NF1_illum_4channel.cppipe").resolve( strict=True - ), - "path_to_output": pathlib.Path("Corrected_Plate_3_prime").resolve(), - }, -} + ) + +# view the dictionary to assess that all info is added correctly +pprint.pprint(plate_info_dictionary, indent=4) -# ## Run illumination correction pipeline on each plate +# ## Run illumination correction pipeline on each plate in parallel # # In this notebook, we do not run the cells to completion as we prefer to run the notebooks as nbconverted python files due to better stability. 
-# In[3]: +# In[4]: -# run through each plate with each set of paths based on dictionary -for plate, info in plates_info_dictionary.items(): - path_to_pipeline = info["path_to_pipeline"] - path_to_output = info["path_to_output"] - path_to_images = info["path_to_images"] - print(f"Correcting {plate}") - - # run illumination correction pipeline and save images - cp_utils.run_cellprofiler( - path_to_pipeline=path_to_pipeline, - path_to_output=path_to_output, - path_to_images=path_to_images, - # these variables are turned off for illum pipeline - sqlite_name=None, - analysis_run=False, - ) +cp_parallel.run_cellprofiler_parallel( + plate_info_dictionary=plate_info_dictionary, run_name=run_name +) diff --git a/2.cellprofiler_analysis/NF1_analysis_plate1_plate2.cppipe b/2.cellprofiler_analysis/NF1_analysis_3channel.cppipe similarity index 85% rename from 2.cellprofiler_analysis/NF1_analysis_plate1_plate2.cppipe rename to 2.cellprofiler_analysis/NF1_analysis_3channel.cppipe index 8449286..0d5f2b0 100644 --- a/2.cellprofiler_analysis/NF1_analysis_plate1_plate2.cppipe +++ b/2.cellprofiler_analysis/NF1_analysis_3channel.cppipe @@ -5,16 +5,16 @@ GitHash: ModuleCount:17 HasImagePlaneDetails:False -Images:[module_num:1|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['To begin creating your project, use the Images module to compile a list of files and/or folders that you want to analyze. 
You can also specify a set of rules to include only the desired files in your selected folders.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +Images:[module_num:1|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['Images module is left blank as we are giving the path to the corrected images in the CLI']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] : Filter images?:Images only Select the rule criteria:and (extension does isimage) (directory doesnot containregexp "[\\\\/]\\.") -Metadata:[module_num:2|svn_version:'Unknown'|variable_revision_number:6|show_window:False|notes:['Had to change the metadata regular expression to remove plate.The value it was associating as plate in the file name is arbitary and the same between all plates in the NF1 project. ', '', 'Plate is not extracted from the metadata so across-all-plate feature processing can not occur.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +Metadata:[module_num:2|svn_version:'Unknown'|variable_revision_number:6|show_window:False|notes:['Extract metadata from file names and folder names using regular expressions.', '', 'The only metadata that will be outputed in the SQLite DB file are:', '', 'Plate', 'Well', 'Site', '', 'The rest of the information is useful to make sure that the expression is working, but can be removed/not necessary.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Extract metadata?:Yes Metadata data type:Text Metadata types:{"Channel": "integer", "FileLocation": "text", "Frame": "text", "Plate": "text", "Series": "text", "Site": "integer", "Stain": "float", "Well": "text"} - Extraction method count:1 + Extraction method count:2 Metadata extraction method:Extract from file/folder names Metadata source:File name Regular expression to extract from file name:(?P[A-Z]{1}[0-9]{1,2})_01_(?P[1-3]{1})_(?P[1-4]{1})_(?PDAPI|GFP|RFP) @@ -26,8 +26,19 @@ 
Metadata:[module_num:2|svn_version:'Unknown'|variable_revision_number:6|show_win Use case insensitive matching?:No Metadata file name:None Does cached metadata exist?:No + Metadata extraction method:Extract from file/folder names + Metadata source:Folder name + Regular expression to extract from file name:^(?P.*)_(?P[A-P][0-9]{2})_s(?P[0-9])_w(?P[0-9]) + Regular expression to extract from folder name:Corrected_(?PPlate_[0-9]{1}) + Extract metadata from:All images + Select the filtering criteria:and (file does contain "") + Metadata file location:Elsewhere...| + Match file and image metadata:[] + Use case insensitive matching?:No + Metadata file name:None + Does cached metadata exist?:No -NamesAndTypes:[module_num:3|svn_version:'Unknown'|variable_revision_number:8|show_window:False|notes:['The NamesAndTypes module allows you to assign a meaningful name to each image by which other modules will refer to it.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +NamesAndTypes:[module_num:3|svn_version:'Unknown'|variable_revision_number:8|show_window:False|notes:['Assign files to their respective channel (only 3):', '', 'DAPI', 'GFP', 'RFP']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Assign a name to:Images matching rules Select the image type:Grayscale image Name to assign these images:DNA @@ -60,12 +71,12 @@ NamesAndTypes:[module_num:3|svn_version:'Unknown'|variable_revision_number:8|sho Set intensity range from:Image metadata Maximum intensity:255.0 -Groups:[module_num:4|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['The Groups module optionally allows you to split your list of images into image subsets (groups) which will be processed independently of each other. 
Examples of groupings include screening batches, microtiter plates, time-lapse movies, etc.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +Groups:[module_num:4|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['We do not use the Groups module.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Do you want to group your images?:No grouping metadata count:1 Metadata category:None -IdentifyPrimaryObjects:[module_num:5|svn_version:'Unknown'|variable_revision_number:15|show_window:False|notes:[]|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +IdentifyPrimaryObjects:[module_num:5|svn_version:'Unknown'|variable_revision_number:15|show_window:False|notes:['These are the current best parameters to segment nuclei from the DAPI channel']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Select the input image:DAPI Name the primary objects to be identified:Nuclei Typical diameter of objects, in pixel units (Min,Max):30,90 @@ -101,7 +112,7 @@ IdentifyPrimaryObjects:[module_num:5|svn_version:'Unknown'|variable_revision_num # of deviations:2.0 Thresholding method:Minimum Cross-Entropy -IdentifySecondaryObjects:[module_num:6|svn_version:'Unknown'|variable_revision_number:10|show_window:False|notes:[]|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +IdentifySecondaryObjects:[module_num:6|svn_version:'Unknown'|variable_revision_number:10|show_window:False|notes:['These are the current best parameters to segment whole cells using the RFP (actin) channel']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Select the input objects:Nuclei Name the objects to be identified:Cells Select the method to identify the secondary objects:Propagation @@ -131,7 +142,7 @@ IdentifySecondaryObjects:[module_num:6|svn_version:'Unknown'|variable_revision_n # of deviations:2.0 Thresholding method:Otsu 
-IdentifyTertiaryObjects:[module_num:7|svn_version:'Unknown'|variable_revision_number:3|show_window:False|notes:[]|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +IdentifyTertiaryObjects:[module_num:7|svn_version:'Unknown'|variable_revision_number:3|show_window:False|notes:['This module creates a third object from the first two where the nuclei is subtracted from the cells to create cytoplasm']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Select the larger identified objects:Cells Select the smaller identified objects:Nuclei Name the tertiary objects to be identified:Cytoplasm @@ -196,7 +207,7 @@ MeasureObjectNeighbors:[module_num:13|svn_version:'Unknown'|variable_revision_nu Name the output image:PercentTouching Select colormap:Oranges -MeasureObjectIntensityDistribution:[module_num:14|svn_version:'Unknown'|variable_revision_number:6|show_window:False|notes:['Measure object intensity distributions witin objects using center x,y coords for each single object.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +MeasureObjectIntensityDistribution:[module_num:14|svn_version:'Unknown'|variable_revision_number:6|show_window:False|notes:['Measure object intensity distributions witin objects using center x,y coords for each single object.', '', 'Currently we do not measure the Zernikes.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Select images to measure:DAPI, GFP, RFP Hidden:3 Hidden:1 @@ -239,7 +250,7 @@ ExportToDatabase:[module_num:17|svn_version:'Unknown'|variable_revision_number:2 Database host: Username: Password: - Name the SQLite database file:NF1_data.sqlite + Name the SQLite database file:nf1_analysis.sqlite Calculate the per-image mean values of object measurements?:No Calculate the per-image median values of object measurements?:No Calculate the per-image standard deviation values of object measurements?:No diff --git 
a/2.cellprofiler_analysis/NF1_analysis_plate3_plate3prime.cppipe b/2.cellprofiler_analysis/NF1_analysis_4channel.cppipe similarity index 87% rename from 2.cellprofiler_analysis/NF1_analysis_plate3_plate3prime.cppipe rename to 2.cellprofiler_analysis/NF1_analysis_4channel.cppipe index 388564d..5fae06e 100644 --- a/2.cellprofiler_analysis/NF1_analysis_plate3_plate3prime.cppipe +++ b/2.cellprofiler_analysis/NF1_analysis_4channel.cppipe @@ -5,16 +5,16 @@ GitHash: ModuleCount:17 HasImagePlaneDetails:False -Images:[module_num:1|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['To begin creating your project, use the Images module to compile a list of files and/or folders that you want to analyze. You can also specify a set of rules to include only the desired files in your selected folders.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +Images:[module_num:1|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['Images module is left blank since we are giving the path to the images in the CLI command']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] : Filter images?:Images only Select the rule criteria:and (extension does isimage) (directory doesnot containregexp "[\\\\/]\\.") -Metadata:[module_num:2|svn_version:'Unknown'|variable_revision_number:6|show_window:False|notes:['Like with plates 1 and 2, there is no plate metadata located in the file names for plates 3 and 3 prime. 
There is no plate metadata extracted and will not be avaliable in the exported SQLite file.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +Metadata:[module_num:2|svn_version:'Unknown'|variable_revision_number:6|show_window:False|notes:['Metadata is extracted from the file and folder names using regular expressions.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Extract metadata?:Yes Metadata data type:Text Metadata types:{"Channel": "integer", "FileLocation": "text", "Frame": "text", "Plate": "text", "Series": "text", "Site": "integer", "Stain": "float", "Well": "text"} - Extraction method count:1 + Extraction method count:2 Metadata extraction method:Extract from file/folder names Metadata source:File name Regular expression to extract from file name:(?P[A-Z]{1}[0-9]{1,2})_01_(?P[1-4]{1})_(?P[0-9]{1,2})_(?PDAPI|CY5|GFP|RFP) @@ -26,8 +26,19 @@ Metadata:[module_num:2|svn_version:'Unknown'|variable_revision_number:6|show_win Use case insensitive matching?:No Metadata file name:None Does cached metadata exist?:No + Metadata extraction method:Extract from file/folder names + Metadata source:Folder name + Regular expression to extract from file name:^(?P.*)_(?P[A-P][0-9]{2})_s(?P[0-9])_w(?P[0-9]) + Regular expression to extract from folder name:Corrected_(?PPlate_[0-9]{1}) + Extract metadata from:All images + Select the filtering criteria:and (file does contain "") + Metadata file location:Elsewhere...| + Match file and image metadata:[] + Use case insensitive matching?:No + Metadata file name:None + Does cached metadata exist?:No -NamesAndTypes:[module_num:3|svn_version:'Unknown'|variable_revision_number:8|show_window:False|notes:['Plate 3 and 3’ need to have their own pipeline from the first two plates due to difference in channels (added mito channel)']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] 
+NamesAndTypes:[module_num:3|svn_version:'Unknown'|variable_revision_number:8|show_window:False|notes:['This pipeline assigns names to 4 channels specifically:', '', 'DAPI', 'GFP', 'CY5', 'RFP']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Assign a name to:Images matching rules Select the image type:Grayscale image Name to assign these images:DNA @@ -66,12 +77,12 @@ NamesAndTypes:[module_num:3|svn_version:'Unknown'|variable_revision_number:8|sho Set intensity range from:Image metadata Maximum intensity:255.0 -Groups:[module_num:4|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['The Groups module optionally allows you to split your list of images into image subsets (groups) which will be processed independently of each other. Examples of groupings include screening batches, microtiter plates, time-lapse movies, etc.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +Groups:[module_num:4|svn_version:'Unknown'|variable_revision_number:2|show_window:False|notes:['We do not use the Groups module.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Do you want to group your images?:No grouping metadata count:1 Metadata category:None -IdentifyPrimaryObjects:[module_num:5|svn_version:'Unknown'|variable_revision_number:15|show_window:False|notes:['The parameters are the same as the analysis pipeline for plates 1 and 2.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +IdentifyPrimaryObjects:[module_num:5|svn_version:'Unknown'|variable_revision_number:15|show_window:False|notes:['The parameters are the same as the analysis pipeline for 3 channels.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Select the input image:DAPI Name the primary objects to be identified:Nuclei Typical diameter of objects, in pixel units (Min,Max):30,90 @@ -107,7 +118,7 @@ IdentifyPrimaryObjects:[module_num:5|svn_version:'Unknown'|variable_revision_num # of deviations:2.0 
Thresholding method:Minimum Cross-Entropy -IdentifySecondaryObjects:[module_num:6|svn_version:'Unknown'|variable_revision_number:10|show_window:False|notes:['The parameters are the same as the analysis pipeline for plates 1 and 2.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +IdentifySecondaryObjects:[module_num:6|svn_version:'Unknown'|variable_revision_number:10|show_window:False|notes:['The parameters are the same as the analysis pipeline for 3 channels.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Select the input objects:Nuclei Name the objects to be identified:Cells Select the method to identify the secondary objects:Propagation @@ -137,7 +148,7 @@ IdentifySecondaryObjects:[module_num:6|svn_version:'Unknown'|variable_revision_n # of deviations:2.0 Thresholding method:Otsu -IdentifyTertiaryObjects:[module_num:7|svn_version:'Unknown'|variable_revision_number:3|show_window:False|notes:['The parameters are the same as the analysis pipeline for plates 1 and 2.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +IdentifyTertiaryObjects:[module_num:7|svn_version:'Unknown'|variable_revision_number:3|show_window:False|notes:['The parameters are the same as the analysis pipeline for 3 channels.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Select the larger identified objects:Cells Select the smaller identified objects:Nuclei Name the tertiary objects to be identified:Cytoplasm @@ -202,7 +213,7 @@ MeasureObjectNeighbors:[module_num:13|svn_version:'Unknown'|variable_revision_nu Name the output image:PercentTouching Select colormap:Oranges -MeasureObjectIntensityDistribution:[module_num:14|svn_version:'Unknown'|variable_revision_number:6|show_window:False|notes:['Measure object intensity distributions witin objects using center x,y coords for each single object.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] 
+MeasureObjectIntensityDistribution:[module_num:14|svn_version:'Unknown'|variable_revision_number:6|show_window:False|notes:['Measure object intensity distributions witin objects using center x,y coords for each single object.', '', 'We do not calculate Zernikes currently.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Select images to measure:CY5, DAPI, GFP, RFP Hidden:3 Hidden:1 @@ -222,7 +233,7 @@ MeasureObjectIntensityDistribution:[module_num:14|svn_version:'Unknown'|variable Number of bins:4 Maximum radius:100 -MeasureObjectSizeShape:[module_num:15|svn_version:'Unknown'|variable_revision_number:3|show_window:False|notes:['Measure object size and shape']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +MeasureObjectSizeShape:[module_num:15|svn_version:'Unknown'|variable_revision_number:3|show_window:False|notes:['Measure object size and shape.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Select object sets to measure:Cells, Cytoplasm, Nuclei Calculate the Zernike features?:Yes Calculate the advanced features?:No @@ -235,7 +246,7 @@ MeasureTexture:[module_num:16|svn_version:'Unknown'|variable_revision_number:7|s Measure whole images or objects?:Both Texture scale to measure:3 -ExportToDatabase:[module_num:17|svn_version:'Unknown'|variable_revision_number:28|show_window:False|notes:[]|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] +ExportToDatabase:[module_num:17|svn_version:'Unknown'|variable_revision_number:28|show_window:False|notes:['Output the morphology features into an SQLite database.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] Database type:SQLite Database name:DefaultDB Add a prefix to table names?:No @@ -245,7 +256,7 @@ ExportToDatabase:[module_num:17|svn_version:'Unknown'|variable_revision_number:2 Database host: Username: Password: - Name the SQLite database file:NF1_data.sqlite + Name the SQLite database file:nf1_analysis.sqlite Calculate 
the per-image mean values of object measurements?:No Calculate the per-image median values of object measurements?:No Calculate the per-image standard deviation values of object measurements?:No diff --git a/2.cellprofiler_analysis/README.md b/2.cellprofiler_analysis/README.md index 196020b..7b2f26d 100644 --- a/2.cellprofiler_analysis/README.md +++ b/2.cellprofiler_analysis/README.md @@ -16,9 +16,31 @@ cd 2.cellprofiler_analysis source nf1_analysis.sh ``` +## CellProfiler Parallel + +To improve the speed for analysis, we have implemented `CellProfiler Parallel`, which utilizes multi-processing to run one plate per CPU core. + +All analysis is performed on a Linux-based machine running Pop_OS! LTS 22.04 with an AMD Ryzen 7 3700X 8-Core Processor, 16 CPU cores, and 64GB of RAM. + +### Speed when running CellProfiler Parallel + +When using CellProfiler Parallel to run analysis on 5 plates, it took approximately 28 hours to finish. + +### Speed when running CellProfiler Sequential + To run analysis on plates 1 and 2, it took approximately one hour. -To run analysis on plates 3 and 3 prime, it took approximately 31 hours. -The analysis was run on a Linux-based machine running Pop_OS! LTS 22.04 with an AMD Ryzen 7 3700X 8-Core Processor. +To run analysis on plates 3 and 3 prime, it took approximately 31 hours (~ 16 hours each). + +#### Difference in speed + +When looking at the log files, a fifth plate (e.g., `Plate_4``) took about **24 hours** to processes. +Added on to the CellProfiler Sequential, which in total took 32 hours for 4 plates, it would likely have added on another 10-14 hours (given how long plates 3 and 3 prime took), totalling **over 45 hours**. + +Even though plates individually ran faster, the computational time saved by running them in parallel **saves over 20 hours of time**. 
+ +What might be able to improve the individual processing time per plate could be to increase the number of workers (even though CellProfiler CLI should only be using one core per processes). +Currently, CellProfiler Parallel will automatically set the number of workers based on the number of commands (e.g., plates to be processed). +Depending on needs, a parameter can be added to manually set the max_workers. ## Accessing the CellProfiler output - SQLite files diff --git a/2.cellprofiler_analysis/analysis_output/Plate_1.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_1.sqlite deleted file mode 100644 index 7ba0520..0000000 --- a/2.cellprofiler_analysis/analysis_output/Plate_1.sqlite +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a3336fcf89ea296832c4f3ca7a532f2c3124e7aba5249f30a4899e5eda2d67e4 -size 4001792 diff --git a/2.cellprofiler_analysis/analysis_output/Plate_1/nf1_analysis.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_1/nf1_analysis.sqlite new file mode 100644 index 0000000..331bff3 --- /dev/null +++ b/2.cellprofiler_analysis/analysis_output/Plate_1/nf1_analysis.sqlite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13de543486df0a97344ef40b2dec46562d9d5190515136251020481bfd91fd66 +size 4005888 diff --git a/2.cellprofiler_analysis/analysis_output/Plate_2.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_2.sqlite deleted file mode 100644 index 166d1a0..0000000 --- a/2.cellprofiler_analysis/analysis_output/Plate_2.sqlite +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66c14464eb2da55c37d0bdb11792a344775511db9b9dfdb330b8168044e16107 -size 26222592 diff --git a/2.cellprofiler_analysis/analysis_output/Plate_2/nf1_analysis.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_2/nf1_analysis.sqlite new file mode 100644 index 0000000..747ad16 --- /dev/null +++ b/2.cellprofiler_analysis/analysis_output/Plate_2/nf1_analysis.sqlite @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:e0beb41c53315d84af9818526eac1a7e393fa2feb44e9c1eae6f940360a03771 +size 26226688 diff --git a/2.cellprofiler_analysis/analysis_output/Plate_3.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_3.sqlite deleted file mode 100644 index bc2d42f..0000000 --- a/2.cellprofiler_analysis/analysis_output/Plate_3.sqlite +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:97463933d94ca926a7ac5b358f73c3454dc4c309ea38e7b4d44c743a1159728b -size 329084928 diff --git a/2.cellprofiler_analysis/analysis_output/Plate_3/nf1_analysis.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_3/nf1_analysis.sqlite new file mode 100644 index 0000000..960a7e9 --- /dev/null +++ b/2.cellprofiler_analysis/analysis_output/Plate_3/nf1_analysis.sqlite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0688c07ac7c3ed6b4f2fccefd3bd4553252005c11ebd184660c498e452474252 +size 329482240 diff --git a/2.cellprofiler_analysis/analysis_output/Plate_3_prime.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_3_prime.sqlite deleted file mode 100644 index a54b835..0000000 --- a/2.cellprofiler_analysis/analysis_output/Plate_3_prime.sqlite +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3308860ebffdcdec3a9d709da0890f58becf947a8a218faf430178833afe8982 -size 266272768 diff --git a/2.cellprofiler_analysis/analysis_output/Plate_3_prime/nf1_analysis.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_3_prime/nf1_analysis.sqlite new file mode 100644 index 0000000..7afcb65 --- /dev/null +++ b/2.cellprofiler_analysis/analysis_output/Plate_3_prime/nf1_analysis.sqlite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99ffd797a3afe836e107dbd32c7b70f1fa050b7b6b80f1479b96c89e0797b3bb +size 266534912 diff --git a/2.cellprofiler_analysis/analysis_output/Plate_4/nf1_analysis.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_4/nf1_analysis.sqlite 
new file mode 100644 index 0000000..0e9e27a --- /dev/null +++ b/2.cellprofiler_analysis/analysis_output/Plate_4/nf1_analysis.sqlite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:954032099b6969f32241cde62588090ddd314255001505c44b29fa7d45d71d92 +size 134832128 diff --git a/2.cellprofiler_analysis/nf1_analysis.ipynb b/2.cellprofiler_analysis/nf1_analysis.ipynb index 7873442..84fb6ca 100644 --- a/2.cellprofiler_analysis/nf1_analysis.ipynb +++ b/2.cellprofiler_analysis/nf1_analysis.ipynb @@ -5,7 +5,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Perform segmentation and feature extraction for each plate using CellProfiler" + "# Perform segmentation and feature extraction for each plate using CellProfiler Parallel" ] }, { @@ -22,11 +22,13 @@ "metadata": {}, "outputs": [], "source": [ - "import sys\n", "import pathlib\n", + "import pprint\n", + "\n", + "import sys\n", "\n", - "sys.path.append(\"../\")\n", - "from utils import cp_utils" + "sys.path.append(\"../utils\")\n", + "import cp_parallel" ] }, { @@ -34,9 +36,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Set paths for each plate\n", - "\n", - "Note: Due to the different channel numbers between plates 1 + 2 (3 channels) and plates 3 + 3 prime (4 channels), there needs to be two different cppipe files (like in the IC module). 
" + "## Set paths and variables" ] }, { @@ -45,59 +45,29 @@ "metadata": {}, "outputs": [], "source": [ - "# create output directory for SQLite files if needed\n", - "path_to_output = pathlib.Path(\"./analysis_output\").resolve()\n", - "path_to_output.mkdir(exist_ok=True)\n", + "# set the run type for the parallelization\n", + "run_name = \"analysis\"\n", "\n", - "# dictionary with paths for each plate\n", - "plates_info_dictionary = {\n", - " \"Plate_1\": {\n", - " # this pipeline is specific to plates 1 and 2\n", - " \"path_to_pipeline\": pathlib.Path(\"NF1_analysis_plate1_plate2.cppipe\").resolve(\n", - " strict=True\n", - " ),\n", - " \"path_to_images\": pathlib.Path(\n", - " \"../1.cellprofiler_ic/Corrected_Plate_1/\"\n", - " ).resolve(strict=True),\n", - " },\n", - " \"Plate_2\": {\n", - " # this pipeline is specific to plates 1 and 2\n", - " \"path_to_pipeline\": pathlib.Path(\"NF1_analysis_plate1_plate2.cppipe\").resolve(\n", - " strict=True\n", - " ),\n", - " \"path_to_images\": pathlib.Path(\n", - " \"../1.cellprofiler_ic/Corrected_Plate_2/\"\n", - " ).resolve(strict=True),\n", - " },\n", - " \"Plate_3\": {\n", - " # this pipeline is specific to plates 3 and 3'\n", - " \"path_to_pipeline\": pathlib.Path(\"NF1_analysis_plate3_plate3prime.cppipe\").resolve(\n", - " strict=True\n", - " ),\n", - " \"path_to_images\": pathlib.Path(\n", - " \"../1.cellprofiler_ic/Corrected_Plate_3/\"\n", - " ).resolve(strict=True),\n", - " },\n", - " \"Plate_3_prime\": {\n", - " # this pipeline is specific to plates 3 and 3'\n", - " \"path_to_pipeline\": pathlib.Path(\"NF1_analysis_plate3_plate3prime.cppipe\").resolve(\n", - " strict=True\n", - " ),\n", - " \"path_to_images\": pathlib.Path(\n", - " \"../1.cellprofiler_ic/Corrected_Plate_3_prime/\"\n", - " ).resolve(strict=True),\n", - " },\n", - "}" + "# set main output dir for all plates\n", + "output_dir = pathlib.Path(\"./analysis_output\")\n", + "output_dir.mkdir(exist_ok=True)\n", + "\n", + "# directory where images are 
located within folders\n", + "images_dir = pathlib.Path(\"../1.cellprofiler_ic/Corrected_Images/\")\n", + "\n", + "# list for plate names based on folders to use to create dictionary\n", + "plate_names = []\n", + "# iterate through 0.download_data and append plate names from folder names that contain image data from that plate\n", + "for file_path in pathlib.Path(\"../0.download_data/\").iterdir():\n", + " if str(file_path.stem).startswith(\"Plate\"):\n", + " plate_names.append(str(file_path.stem))" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Run analysis pipeline on each plate\n", - "\n", - "This cell is not finished to completion due to how long it would take. It is ran in the python file instead." + "## Create dictionary with all info for each plate" ] }, { @@ -109,13 +79,68 @@ "name": "stdout", "output_type": "stream", "text": [ - "Running analysis on Plate_1!\n", - "Starting CellProfiler run on Corrected_Plate_1\n", - "The file is renamed to Plate_1.sqlite!\n", - "Running analysis on Plate_2!\n", - "Starting CellProfiler run on Corrected_Plate_2\n" + "{ 'Plate_1': { 'path_to_images': PosixPath('/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_1'),\n", + " 'path_to_output': PosixPath('analysis_output/Plate_1'),\n", + " 'path_to_pipeline': PosixPath('/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/NF1_analysis_3channel.cppipe')},\n", + " 'Plate_2': { 'path_to_images': PosixPath('/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2'),\n", + " 'path_to_output': PosixPath('analysis_output/Plate_2'),\n", + " 'path_to_pipeline': PosixPath('/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/NF1_analysis_3channel.cppipe')},\n", + " 'Plate_3': { 'path_to_images': PosixPath('/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_3'),\n", + " 'path_to_output': PosixPath('analysis_output/Plate_3'),\n", + " 
'path_to_pipeline': PosixPath('/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/NF1_analysis_4channel.cppipe')},\n", + " 'Plate_3_prime': { 'path_to_images': PosixPath('/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_3_prime'),\n", + " 'path_to_output': PosixPath('analysis_output/Plate_3_prime'),\n", + " 'path_to_pipeline': PosixPath('/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/NF1_analysis_4channel.cppipe')},\n", + " 'Plate_4': { 'path_to_images': PosixPath('/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_4'),\n", + " 'path_to_output': PosixPath('analysis_output/Plate_4'),\n", + " 'path_to_pipeline': PosixPath('/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/NF1_analysis_4channel.cppipe')}}\n" ] - }, + } + ], + "source": [ + "# create plate info dictionary with all parts of the CellProfiler CLI command to run in parallel\n", + "plate_info_dictionary = {\n", + " name: {\n", + " \"path_to_images\": pathlib.Path(\n", + " list(images_dir.rglob(f\"Corrected_{name}\"))[0]\n", + " ).resolve(strict=True),\n", + " \"path_to_output\": pathlib.Path(f\"{output_dir}/{name}\"),\n", + " }\n", + " for name in plate_names\n", + "}\n", + "\n", + "# iterate over the dictionary and add the path_to_pipeline specific for each plate\n", + "for name, info in plate_info_dictionary.items():\n", + " # only plates 1 and 2 have 3 channels so these are the only plates that use this path\n", + " if name == \"Plate_1\" or name == \"Plate_2\":\n", + " info[\"path_to_pipeline\"] = pathlib.Path(\n", + " f\"./NF1_analysis_3channel.cppipe\"\n", + " ).resolve(strict=True)\n", + " # all other plates have 4 channels and will use that specific pipeline\n", + " else:\n", + " info[\"path_to_pipeline\"] = pathlib.Path(\n", + " f\"./NF1_analysis_4channel.cppipe\"\n", + " ).resolve(strict=True)\n", + "\n", + "# view the dictionary to assess that all info is added correctly\n", + 
"pprint.pprint(plate_info_dictionary, indent=4)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run analysis pipeline on each plate in parallel\n", + "\n", + "This cell is not finished to completion due to how long it would take. It is ran in the python file instead." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ { "ename": "KeyboardInterrupt", "evalue": "", @@ -123,33 +148,19 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[3], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mRunning analysis on \u001b[39m\u001b[39m{\u001b[39;00mplate\u001b[39m}\u001b[39;00m\u001b[39m!\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m \u001b[39m# run analysis pipeline\u001b[39;00m\n\u001b[0;32m----> 7\u001b[0m cp_utils\u001b[39m.\u001b[39;49mrun_cellprofiler(\n\u001b[1;32m 8\u001b[0m path_to_pipeline\u001b[39m=\u001b[39;49mpath_to_pipeline,\n\u001b[1;32m 9\u001b[0m path_to_output\u001b[39m=\u001b[39;49mpath_to_output,\n\u001b[1;32m 10\u001b[0m path_to_images\u001b[39m=\u001b[39;49mpath_to_images,\n\u001b[1;32m 11\u001b[0m \u001b[39m# name each SQLite file after plate name\u001b[39;49;00m\n\u001b[1;32m 12\u001b[0m sqlite_name\u001b[39m=\u001b[39;49mplate,\n\u001b[1;32m 13\u001b[0m analysis_run\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m,\n\u001b[1;32m 14\u001b[0m )\n", - "File \u001b[0;32m~/nf1_cellpainting_data/2.cellprofiler_analysis/../utils/cp_utils.py:68\u001b[0m, in \u001b[0;36mrun_cellprofiler\u001b[0;34m(path_to_pipeline, path_to_output, path_to_images, sqlite_name, analysis_run)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[39mwith\u001b[39;00m 
\u001b[39mopen\u001b[39m(pathlib\u001b[39m.\u001b[39mPath(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mlogs/cellprofiler_output_\u001b[39m\u001b[39m{\u001b[39;00mpathlib\u001b[39m.\u001b[39mPath(path_to_images)\u001b[39m.\u001b[39mname\u001b[39m}\u001b[39;00m\u001b[39m.log\u001b[39m\u001b[39m\"\u001b[39m), \u001b[39m\"\u001b[39m\u001b[39mw\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m cellprofiler_output_file:\n\u001b[1;32m 66\u001b[0m \u001b[39m# run CellProfiler for a illumination correction pipeline\u001b[39;00m\n\u001b[1;32m 67\u001b[0m command \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcellprofiler -c -r -p \u001b[39m\u001b[39m{\u001b[39;00mpath_to_pipeline\u001b[39m}\u001b[39;00m\u001b[39m -o \u001b[39m\u001b[39m{\u001b[39;00mpath_to_output\u001b[39m}\u001b[39;00m\u001b[39m -i \u001b[39m\u001b[39m{\u001b[39;00mpath_to_images\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m---> 68\u001b[0m subprocess\u001b[39m.\u001b[39;49mrun(command, shell\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, stdout\u001b[39m=\u001b[39;49mcellprofiler_output_file, stderr\u001b[39m=\u001b[39;49mcellprofiler_output_file)\n\u001b[1;32m 69\u001b[0m cellprofiler_output_file\u001b[39m.\u001b[39mclose()\n\u001b[1;32m 71\u001b[0m \u001b[39mif\u001b[39;00m analysis_run:\n\u001b[1;32m 72\u001b[0m \u001b[39m# runs through any files that are in the output path\u001b[39;00m\n", - "File \u001b[0;32m~/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/subprocess.py:495\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m 493\u001b[0m \u001b[39mwith\u001b[39;00m Popen(\u001b[39m*\u001b[39mpopenargs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs) \u001b[39mas\u001b[39;00m process:\n\u001b[1;32m 494\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 495\u001b[0m stdout, stderr \u001b[39m=\u001b[39m 
process\u001b[39m.\u001b[39;49mcommunicate(\u001b[39minput\u001b[39;49m, timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[1;32m 496\u001b[0m \u001b[39mexcept\u001b[39;00m TimeoutExpired \u001b[39mas\u001b[39;00m exc:\n\u001b[1;32m 497\u001b[0m process\u001b[39m.\u001b[39mkill()\n", - "File \u001b[0;32m~/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/subprocess.py:1020\u001b[0m, in \u001b[0;36mPopen.communicate\u001b[0;34m(self, input, timeout)\u001b[0m\n\u001b[1;32m 1018\u001b[0m stderr \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstderr\u001b[39m.\u001b[39mread()\n\u001b[1;32m 1019\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstderr\u001b[39m.\u001b[39mclose()\n\u001b[0;32m-> 1020\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mwait()\n\u001b[1;32m 1021\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1022\u001b[0m \u001b[39mif\u001b[39;00m timeout \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/subprocess.py:1083\u001b[0m, in \u001b[0;36mPopen.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1081\u001b[0m endtime \u001b[39m=\u001b[39m _time() \u001b[39m+\u001b[39m timeout\n\u001b[1;32m 1082\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 1083\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_wait(timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[1;32m 1084\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m 1085\u001b[0m \u001b[39m# https://bugs.python.org/issue25942\u001b[39;00m\n\u001b[1;32m 1086\u001b[0m \u001b[39m# The first keyboard interrupt waits briefly for the child to\u001b[39;00m\n\u001b[1;32m 1087\u001b[0m \u001b[39m# exit under the common assumption that it also received the ^C\u001b[39;00m\n\u001b[1;32m 1088\u001b[0m \u001b[39m# generated SIGINT and will exit rapidly.\u001b[39;00m\n\u001b[1;32m 
1089\u001b[0m \u001b[39mif\u001b[39;00m timeout \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/subprocess.py:1806\u001b[0m, in \u001b[0;36mPopen._wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1804\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mreturncode \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 1805\u001b[0m \u001b[39mbreak\u001b[39;00m \u001b[39m# Another thread waited.\u001b[39;00m\n\u001b[0;32m-> 1806\u001b[0m (pid, sts) \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_try_wait(\u001b[39m0\u001b[39;49m)\n\u001b[1;32m 1807\u001b[0m \u001b[39m# Check the pid and loop as waitpid has been known to\u001b[39;00m\n\u001b[1;32m 1808\u001b[0m \u001b[39m# return 0 even without WNOHANG in odd situations.\u001b[39;00m\n\u001b[1;32m 1809\u001b[0m \u001b[39m# http://bugs.python.org/issue14396.\u001b[39;00m\n\u001b[1;32m 1810\u001b[0m \u001b[39mif\u001b[39;00m pid \u001b[39m==\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpid:\n", - "File \u001b[0;32m~/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/subprocess.py:1764\u001b[0m, in \u001b[0;36mPopen._try_wait\u001b[0;34m(self, wait_flags)\u001b[0m\n\u001b[1;32m 1762\u001b[0m \u001b[39m\"\"\"All callers to this function MUST hold self._waitpid_lock.\"\"\"\u001b[39;00m\n\u001b[1;32m 1763\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 1764\u001b[0m (pid, sts) \u001b[39m=\u001b[39m os\u001b[39m.\u001b[39;49mwaitpid(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpid, wait_flags)\n\u001b[1;32m 1765\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mChildProcessError\u001b[39;00m:\n\u001b[1;32m 1766\u001b[0m \u001b[39m# This happens if SIGCLD is set to be ignored or waiting\u001b[39;00m\n\u001b[1;32m 1767\u001b[0m \u001b[39m# for child processes has otherwise been disabled for 
our\u001b[39;00m\n\u001b[1;32m 1768\u001b[0m \u001b[39m# process. This child is dead, we can't get the status.\u001b[39;00m\n\u001b[1;32m 1769\u001b[0m pid \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpid\n", + "Cell \u001b[0;32mIn[4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m cp_parallel\u001b[39m.\u001b[39;49mrun_cellprofiler_parallel(\n\u001b[1;32m 2\u001b[0m plate_info_dictionary\u001b[39m=\u001b[39;49mplate_info_dictionary, run_name\u001b[39m=\u001b[39;49mrun_name\n\u001b[1;32m 3\u001b[0m )\n", + "File \u001b[0;32m~/nf1_cellpainting_data/2.cellprofiler_analysis/../utils/cp_parallel.py:119\u001b[0m, in \u001b[0;36mrun_cellprofiler_parallel\u001b[0;34m(plate_info_dictionary, run_name)\u001b[0m\n\u001b[1;32m 109\u001b[0m futures: List[Future] \u001b[39m=\u001b[39m [\n\u001b[1;32m 110\u001b[0m executor\u001b[39m.\u001b[39msubmit(\n\u001b[1;32m 111\u001b[0m subprocess\u001b[39m.\u001b[39mrun,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[39mfor\u001b[39;00m command \u001b[39min\u001b[39;00m commands\n\u001b[1;32m 116\u001b[0m ]\n\u001b[1;32m 118\u001b[0m \u001b[39m# the list of CompletedProcesses holds all the information from the CellProfiler run\u001b[39;00m\n\u001b[0;32m--> 119\u001b[0m results: List[subprocess\u001b[39m.\u001b[39mCompletedProcess] \u001b[39m=\u001b[39m [future\u001b[39m.\u001b[39mresult() \u001b[39mfor\u001b[39;00m future \u001b[39min\u001b[39;00m futures]\n\u001b[1;32m 121\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mAll processes have been completed!\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 123\u001b[0m \u001b[39m# for each process, confirm that the process completed succesfully and return a log file\u001b[39;00m\n", + "File \u001b[0;32m~/nf1_cellpainting_data/2.cellprofiler_analysis/../utils/cp_parallel.py:119\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 109\u001b[0m futures: List[Future] \u001b[39m=\u001b[39m [\n\u001b[1;32m 110\u001b[0m 
executor\u001b[39m.\u001b[39msubmit(\n\u001b[1;32m 111\u001b[0m subprocess\u001b[39m.\u001b[39mrun,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[39mfor\u001b[39;00m command \u001b[39min\u001b[39;00m commands\n\u001b[1;32m 116\u001b[0m ]\n\u001b[1;32m 118\u001b[0m \u001b[39m# the list of CompletedProcesses holds all the information from the CellProfiler run\u001b[39;00m\n\u001b[0;32m--> 119\u001b[0m results: List[subprocess\u001b[39m.\u001b[39mCompletedProcess] \u001b[39m=\u001b[39m [future\u001b[39m.\u001b[39;49mresult() \u001b[39mfor\u001b[39;00m future \u001b[39min\u001b[39;00m futures]\n\u001b[1;32m 121\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mAll processes have been completed!\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 123\u001b[0m \u001b[39m# for each process, confirm that the process completed succesfully and return a log file\u001b[39;00m\n", + "File \u001b[0;32m~/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/concurrent/futures/_base.py:439\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_state \u001b[39m==\u001b[39m FINISHED:\n\u001b[1;32m 437\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m__get_result()\n\u001b[0;32m--> 439\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_condition\u001b[39m.\u001b[39;49mwait(timeout)\n\u001b[1;32m 441\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_state \u001b[39min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n\u001b[1;32m 442\u001b[0m \u001b[39mraise\u001b[39;00m CancelledError()\n", + "File \u001b[0;32m~/anaconda3/envs/nf1_cellpainting_data/lib/python3.8/threading.py:302\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 300\u001b[0m \u001b[39mtry\u001b[39;00m: \u001b[39m# restore state no matter what (e.g., 
KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m 301\u001b[0m \u001b[39mif\u001b[39;00m timeout \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 302\u001b[0m waiter\u001b[39m.\u001b[39;49macquire()\n\u001b[1;32m 303\u001b[0m gotit \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 304\u001b[0m \u001b[39melse\u001b[39;00m:\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ - "# run through each plate with each set of paths based on dictionary\n", - "for plate, info in plates_info_dictionary.items():\n", - " path_to_pipeline = info[\"path_to_pipeline\"]\n", - " path_to_images = info[\"path_to_images\"]\n", - " print(f\"Running analysis on {plate}!\")\n", - "\n", - " # run analysis pipeline\n", - " cp_utils.run_cellprofiler(\n", - " path_to_pipeline=path_to_pipeline,\n", - " path_to_output=path_to_output,\n", - " path_to_images=path_to_images,\n", - " # name each SQLite file after plate name\n", - " sqlite_name=plate,\n", - " analysis_run=True,\n", - " )" + "cp_parallel.run_cellprofiler_parallel(\n", + " plate_info_dictionary=plate_info_dictionary, run_name=run_name\n", + ")" ] } ], diff --git a/2.cellprofiler_analysis/scripts/nf1_analysis.py b/2.cellprofiler_analysis/scripts/nf1_analysis.py index d124894..e08b8d9 100644 --- a/2.cellprofiler_analysis/scripts/nf1_analysis.py +++ b/2.cellprofiler_analysis/scripts/nf1_analysis.py @@ -1,92 +1,86 @@ #!/usr/bin/env python # coding: utf-8 -# # Perform segmentation and feature extraction for each plate using CellProfiler +# # Perform segmentation and feature extraction for each plate using CellProfiler Parallel # ## Import libraries # In[1]: -import sys import pathlib +import pprint -sys.path.append("../") -from utils import cp_utils +import sys +sys.path.append("../utils") +import cp_parallel -# ## Set paths for each plate -# -# Note: Due to the different channel numbers between plates 1 + 2 (3 channels) and plates 3 + 3 prime (4 channels), there needs to be two different 
cppipe files (like in the IC module). + +# ## Set paths and variables # In[2]: -# create output directory for SQLite files if needed -path_to_output = pathlib.Path("./analysis_output").resolve() -path_to_output.mkdir(exist_ok=True) +# set the run type for the parallelization +run_name = "analysis" -# dictionary with paths for each plate -plates_info_dictionary = { - "Plate_1": { - # this pipeline is specific to plates 1 and 2 - "path_to_pipeline": pathlib.Path("NF1_analysis_plate1_plate2.cppipe").resolve( - strict=True - ), - "path_to_images": pathlib.Path( - "../1.cellprofiler_ic/Corrected_Plate_1/" - ).resolve(strict=True), - }, - "Plate_2": { - # this pipeline is specific to plates 1 and 2 - "path_to_pipeline": pathlib.Path("NF1_analysis_plate1_plate2.cppipe").resolve( - strict=True - ), - "path_to_images": pathlib.Path( - "../1.cellprofiler_ic/Corrected_Plate_2/" - ).resolve(strict=True), - }, - "Plate_3": { - # this pipeline is specific to plates 3 and 3' - "path_to_pipeline": pathlib.Path("NF1_analysis_plate3_plate3prime.cppipe").resolve( - strict=True - ), - "path_to_images": pathlib.Path( - "../1.cellprofiler_ic/Corrected_Plate_3/" - ).resolve(strict=True), - }, - "Plate_3_prime": { - # this pipeline is specific to plates 3 and 3' - "path_to_pipeline": pathlib.Path("NF1_analysis_plate3_plate3prime.cppipe").resolve( - strict=True - ), +# set main output dir for all plates +output_dir = pathlib.Path("./analysis_output") +output_dir.mkdir(exist_ok=True) + +# directory where images are located within folders +images_dir = pathlib.Path("../1.cellprofiler_ic/Corrected_Images/") + +# list for plate names based on folders to use to create dictionary +plate_names = [] +# iterate through 0.download_data and append plate names from folder names that contain image data from that plate +for file_path in pathlib.Path("../0.download_data/").iterdir(): + if str(file_path.stem).startswith("Plate"): + plate_names.append(str(file_path.stem)) + + +# ## Create dictionary with 
all info for each plate + +# In[3]: + + +# create plate info dictionary with all parts of the CellProfiler CLI command to run in parallel +plate_info_dictionary = { + name: { "path_to_images": pathlib.Path( - "../1.cellprofiler_ic/Corrected_Plate_3_prime/" + list(images_dir.rglob(f"Corrected_{name}"))[0] ).resolve(strict=True), - }, + "path_to_output": pathlib.Path(f"{output_dir}/{name}"), + } + for name in plate_names } +# iterate over the dictionary and add the path_to_pipeline specific for each plate +for name, info in plate_info_dictionary.items(): + # only plates 1 and 2 have 3 channels so these are the only plates that use this path + if name == "Plate_1" or name == "Plate_2": + info["path_to_pipeline"] = pathlib.Path( + f"./NF1_analysis_3channel.cppipe" + ).resolve(strict=True) + # all other plates have 4 channels and will use that specific pipeline + else: + info["path_to_pipeline"] = pathlib.Path( + f"./NF1_analysis_4channel.cppipe" + ).resolve(strict=True) + +# view the dictionary to assess that all info is added correctly +pprint.pprint(plate_info_dictionary, indent=4) -# ## Run analysis pipeline on each plate + +# ## Run analysis pipeline on each plate in parallel # # This cell is not finished to completion due to how long it would take. It is ran in the python file instead. 
-# In[3]:
+# In[4]:
 
 
-# run through each plate with each set of paths based on dictionary
-for plate, info in plates_info_dictionary.items():
-    path_to_pipeline = info["path_to_pipeline"]
-    path_to_images = info["path_to_images"]
-    print(f"Running analysis on {plate}!")
-
-    # run analysis pipeline
-    cp_utils.run_cellprofiler(
-        path_to_pipeline=path_to_pipeline,
-        path_to_output=path_to_output,
-        path_to_images=path_to_images,
-        # name each SQLite file after plate name
-        sqlite_name=plate,
-        analysis_run=True,
-    )
+cp_parallel.run_cellprofiler_parallel(
+    plate_info_dictionary=plate_info_dictionary, run_name=run_name
+)
diff --git a/utils/cp_parallel.py b/utils/cp_parallel.py
new file mode 100644
index 0000000..d2acfe6
--- /dev/null
+++ b/utils/cp_parallel.py
@@ -0,0 +1,163 @@
+"""
+This collection of functions runs CellProfiler in parallel and can convert the results into log files
+for each process.
+"""
+
+import multiprocessing
+from typing import List
+import os
+import subprocess
+import pathlib
+from concurrent.futures import ProcessPoolExecutor, Future
+from errors.exceptions import MaxWorkerError
+
+
+def _plate_name_from_command(command: list) -> str:
+    """
+    Return the plate name for a CellProfiler command, which is the name of the output directory.
+
+    Args:
+        command (list): CellProfiler CLI command where the output directory follows the "-o" flag
+
+    Returns:
+        str: name of the output directory in the command
+    """
+    # find the output directory by its flag instead of relying on a fixed position in the command
+    return pathlib.Path(command[command.index("-o") + 1]).name
+
+
+def results_to_log(
+    results: List[subprocess.CompletedProcess], log_dir: pathlib.Path, run_name: str
+) -> None:
+    """
+    This function will take the list of subprocess.results from a CellProfiler parallelization run and
+    convert into a log file for each process.
+
+    Each log file is named "<plate name>_<run_name>_run.log" and contains the plate name followed by the
+    captured stderr of the process.
+
+    Args:
+        results (List[subprocess.CompletedProcess]): the outputs from a subprocess.run (with captured output)
+        log_dir (pathlib.Path): directory for log files
+        run_name (str): a given name for the type of CellProfiler run being done on the plates (example: whole image features)
+    """
+    # Access the command (args) and stderr (output) for each CompletedProcess object
+    for result in results:
+        # assign plate name and decode the CellProfiler output to use in log file
+        plate_name = _plate_name_from_command(result.args)
+        # replace any bytes that are not valid UTF-8 so that the log file can always be written
+        output_string = result.stderr.decode("utf-8", errors="replace")
+
+        # set log file name as plate name from command
+        log_file_path = pathlib.Path(log_dir) / f"{plate_name}_{run_name}_run.log"
+        # print output to a log file for each plate to view after the run
+        with open(log_file_path, "w", encoding="utf-8") as log_file:
+            log_file.write(plate_name + "\n")
+            log_file.write(output_string + "\n")
+
+
+def run_cellprofiler_parallel(
+    plate_info_dictionary: dict,
+    run_name: str,
+) -> None:
+    """
+    This function utilizes multi-processing to run CellProfiler pipelines in parallel.
+
+    Args:
+        plate_info_dictionary (dict): dictionary with all paths for CellProfiler to run a pipeline
+            (each plate needs "path_to_pipeline", "path_to_images", and "path_to_output")
+        run_name (str): a given name for the type of CellProfiler run being done on the plates (example: whole image features)
+
+    Raises:
+        ValueError: if the dictionary does not contain any plates
+        FileNotFoundError: if paths to pipeline and images do not exist
+        MaxWorkerError: if the number of plates exceeds the number of CPUs on the machine
+    """
+    # a process pool can not be created with zero workers, so stop early with a clear message
+    if not plate_info_dictionary:
+        raise ValueError("The plate info dictionary is empty, so there is nothing to run.")
+
+    # create a list of commands for each plate
+    commands = []
+
+    # make logs directory
+    log_dir = pathlib.Path("./logs")
+    os.makedirs(log_dir, exist_ok=True)
+
+    # iterate through each plate in the dictionary
+    for info in plate_info_dictionary.values():
+        # set paths for CellProfiler
+        path_to_pipeline = pathlib.Path(info["path_to_pipeline"])
+        path_to_images = pathlib.Path(info["path_to_images"])
+        path_to_output = pathlib.Path(info["path_to_output"])
+
+        # check to make sure paths to pipeline and directory of images are correct before running the pipeline
+        # (is_file returns False for a missing file, while resolve(strict=True) would raise before the message is used)
+        if not path_to_pipeline.is_file():
+            raise FileNotFoundError(
+                f"The file '{path_to_pipeline.name}' does not exist or is not a file"
+            )
+        if not path_to_images.is_dir():
+            raise FileNotFoundError(
+                f"Directory '{path_to_images.name}' does not exist or is not a directory"
+            )
+        # make output directory (and any missing parent directories) if it is not already created
+        path_to_output.mkdir(parents=True, exist_ok=True)
+
+        # creates a command for each plate in the list
+        command = [
+            "cellprofiler",
+            "-c",
+            "-r",
+            "-p",
+            path_to_pipeline,
+            "-o",
+            path_to_output,
+            "-i",
+            path_to_images,
+        ]
+        # creates a list of commands
+        commands.append(command)
+
+    # set the number of CPUs/workers as the number of commands
+    num_processes = len(commands)
+
+    # make sure that the number of workers does not exceed the maximum number of workers for the machine
+    if num_processes > multiprocessing.cpu_count():
+        raise MaxWorkerError(
+            "Exception occurred: The number of commands exceeds the number of CPUs/workers. Please reduce the number of commands."
+        )
+
+    # set parallelization executor to the number of commands (the with block shuts the workers down when finished)
+    with ProcessPoolExecutor(max_workers=num_processes) as executor:
+        # creates a list of futures that are each CellProfiler process for each plate
+        futures: List[Future] = [
+            executor.submit(
+                subprocess.run,
+                args=command,
+                capture_output=True,
+            )
+            for command in commands
+        ]
+
+        # the list of CompletedProcesses holds all the information from the CellProfiler run
+        results: List[subprocess.CompletedProcess] = [
+            future.result() for future in futures
+        ]
+
+    print("All processes have been completed!")
+
+    # convert the results into log files (only once, since results_to_log goes through every process)
+    results_to_log(results=results, log_dir=log_dir, run_name=run_name)
+
+    # for each process, confirm that the process completed successfully
+    for result in results:
+        # any return code other than 0 means that the process did not finish successfully
+        if result.returncode != 0:
+            plate_name = _plate_name_from_command(result.args)
+            print(
+                f"A return code of {result.returncode} was returned for {plate_name}, which means there was an error in the CellProfiler run."
+            )
+
+    # confirmation that logs are converted is printed once after all processes are checked
+    print("All results have been converted to log files!")
diff --git a/utils/cp_utils.py b/utils/cp_sequential.py
similarity index 100%
rename from utils/cp_utils.py
rename to utils/cp_sequential.py
diff --git a/utils/errors/exceptions.py b/utils/errors/exceptions.py
new file mode 100644
index 0000000..71b1426
--- /dev/null
+++ b/utils/errors/exceptions.py
@@ -0,0 +1,9 @@
+"""
+This class defines a custom exception class for exceeding the max workers on a machine.
+"""
+
+class MaxWorkerError(Exception):
+    """
+    Raised when the number of workers assigned to `max_workers` exceeds the number of CPU/workers on the machine.
+    """
+    pass