From 0c0b9b4c1623e9e636a7f5c27807b2f3df0082c6 Mon Sep 17 00:00:00 2001 From: jenna-tomkinson Date: Mon, 17 Jul 2023 11:21:34 -0600 Subject: [PATCH 1/5] add rename SQLite files --- ...sis.sqlite => Plate_1_nf1_analysis.sqlite} | 0 ...sis.sqlite => Plate_2_nf1_analysis.sqlite} | 0 ...sis.sqlite => Plate_3_nf1_analysis.sqlite} | 0 ...lite => Plate_3_prime_nf1_analysis.sqlite} | 0 ...sis.sqlite => Plate_4_nf1_analysis.sqlite} | 0 2.cellprofiler_analysis/nf1_analysis.sh | 10 +- .../rename_sqlite_files.ipynb | 108 ++++++++++++++++++ .../scripts/rename_sqlite_files.py | 43 +++++++ 8 files changed, 157 insertions(+), 4 deletions(-) rename 2.cellprofiler_analysis/analysis_output/Plate_1/{nf1_analysis.sqlite => Plate_1_nf1_analysis.sqlite} (100%) rename 2.cellprofiler_analysis/analysis_output/Plate_2/{nf1_analysis.sqlite => Plate_2_nf1_analysis.sqlite} (100%) rename 2.cellprofiler_analysis/analysis_output/Plate_3/{nf1_analysis.sqlite => Plate_3_nf1_analysis.sqlite} (100%) rename 2.cellprofiler_analysis/analysis_output/Plate_3_prime/{nf1_analysis.sqlite => Plate_3_prime_nf1_analysis.sqlite} (100%) rename 2.cellprofiler_analysis/analysis_output/Plate_4/{nf1_analysis.sqlite => Plate_4_nf1_analysis.sqlite} (100%) create mode 100644 2.cellprofiler_analysis/rename_sqlite_files.ipynb create mode 100644 2.cellprofiler_analysis/scripts/rename_sqlite_files.py diff --git a/2.cellprofiler_analysis/analysis_output/Plate_1/nf1_analysis.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite similarity index 100% rename from 2.cellprofiler_analysis/analysis_output/Plate_1/nf1_analysis.sqlite rename to 2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite diff --git a/2.cellprofiler_analysis/analysis_output/Plate_2/nf1_analysis.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite similarity index 100% rename from 2.cellprofiler_analysis/analysis_output/Plate_2/nf1_analysis.sqlite rename to 2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite diff --git a/2.cellprofiler_analysis/analysis_output/Plate_3/nf1_analysis.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite similarity index 100% rename from 2.cellprofiler_analysis/analysis_output/Plate_3/nf1_analysis.sqlite rename to 2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite diff --git a/2.cellprofiler_analysis/analysis_output/Plate_3_prime/nf1_analysis.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite similarity index 100% rename from 2.cellprofiler_analysis/analysis_output/Plate_3_prime/nf1_analysis.sqlite rename to 2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite diff --git a/2.cellprofiler_analysis/analysis_output/Plate_4/nf1_analysis.sqlite b/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite similarity index 100% rename from 2.cellprofiler_analysis/analysis_output/Plate_4/nf1_analysis.sqlite rename to 2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite diff --git a/2.cellprofiler_analysis/nf1_analysis.sh b/2.cellprofiler_analysis/nf1_analysis.sh index 0129538..c3bf507 100644 --- a/2.cellprofiler_analysis/nf1_analysis.sh +++ b/2.cellprofiler_analysis/nf1_analysis.sh @@ -5,7 +5,9 @@ conda init bash # activate the main conda environment conda activate nf1_cellpainting_data -# convert the notebook into a python and run the file -jupyter nbconvert --to python \ - --FilesWriter.build_directory=scripts/ \ - --execute nf1_analysis.ipynb +# convert all notebooks to python files into the scripts folder +jupyter nbconvert --to python --output-dir=scripts/ *.ipynb + +# run the python scripts in order (CellProfiler analysis then rename SQLite files) +python scripts/nf1_analysis.py +python scripts/rename_sqlite_files.py diff --git a/2.cellprofiler_analysis/rename_sqlite_files.ipynb b/2.cellprofiler_analysis/rename_sqlite_files.ipynb new file mode 100644 index 0000000..6a2853c --- /dev/null +++ b/2.cellprofiler_analysis/rename_sqlite_files.ipynb @@ -0,0 +1,108 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Rename SQLite files from each plate folder to include plate as the prefix\n", + "\n", + "Due to the name of the SQLite file being hardcoded into the pipeline, the work-around when using `CellProfiler Parallel` is to output the SQLite files into folders with the plate name as to avoid conflicts. The files are renamed after analysis to include the plate prefix." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set path to directory with CellProfiler output" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# directory where SQLite files are located in folders per plate\n", + "sqlite_dir = pathlib.Path(\"../2.cellprofiler_analysis/analysis_output/\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add plate prefix to all SQLite files" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Plate name prefix has been added to ../2.cellprofiler_analysis/analysis_output/Plate_4/nf1_analysis.sqlite. The new name is ../2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite.\n", + "Plate name prefix has been added to ../2.cellprofiler_analysis/analysis_output/Plate_1/nf1_analysis.sqlite. The new name is ../2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite.\n", + "Plate name prefix has been added to ../2.cellprofiler_analysis/analysis_output/Plate_3/nf1_analysis.sqlite. The new name is ../2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite.\n", + "Plate name prefix has been added to ../2.cellprofiler_analysis/analysis_output/Plate_2/nf1_analysis.sqlite. The new name is ../2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite.\n", + "Plate name prefix has been added to ../2.cellprofiler_analysis/analysis_output/Plate_3_prime/nf1_analysis.sqlite. The new name is ../2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite.\n" + ] + } + ], + "source": [ + "# iterate through all folders in directory to get paths to each SQLite file\n", + "for file_path in sqlite_dir.rglob('*.sqlite'):\n", + " # if the SQLite files already start with `Plate`, then the file has already been renamed\n", + " if str(file_path).startswith(\"Plate\"):\n", + " print(f\"{file_path.name} already has the `Plate` prefix, which means it was already corrected.\")\n", + " continue\n", + " # create new file name where the folder name is included as the prefix\n", + " new_file_name = f\"{file_path.parent.name}_{file_path.name}\"\n", + " # create a new path with the new name\n", + " new_path = file_path.with_name(new_file_name)\n", + " # rename all SQLite files by using the new path\n", + " file_path.rename(new_path)\n", + " print(f\"Plate name prefix has been added to {file_path}. The new name is {new_path}.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nf1_cellpainting_data", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.15" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/2.cellprofiler_analysis/scripts/rename_sqlite_files.py b/2.cellprofiler_analysis/scripts/rename_sqlite_files.py new file mode 100644 index 0000000..0689fd5 --- /dev/null +++ b/2.cellprofiler_analysis/scripts/rename_sqlite_files.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +# coding: utf-8 + +# # Rename SQLite files from each plate folder to include plate as the prefix +# +# Due to the name of the SQLite file being hardcoded into the pipeline, the work-around when using `CellProfiler Parallel` is to output the SQLite files into folders with the plate name as to avoid conflicts. The files are renamed after analysis to include the plate prefix. + +# ## Import libraries + +# In[1]: + + +import pathlib + + +# ## Set path to directory with CellProfiler output + +# In[2]: + + +# directory where SQLite files are located in folders per plate +sqlite_dir = pathlib.Path("../2.cellprofiler_analysis/analysis_output/") + + +# ## Add plate prefix to all SQLite files + +# In[3]: + + +# iterate through all folders in directory to get paths to each SQLite file +for file_path in sqlite_dir.rglob('*.sqlite'): + # if the SQLite files already start with `Plate`, then the file has already been renamed + if str(file_path).startswith("Plate"): + print(f"{file_path.name} already has the `Plate` prefix, which means it was already corrected.") + continue + # create new file name where the folder name is included as the prefix + new_file_name = f"{file_path.parent.name}_{file_path.name}" + # create a new path with the new name + new_path = file_path.with_name(new_file_name) + # rename all SQLite files by using the new path + file_path.rename(new_path) + print(f"Plate name prefix has been added to {file_path}. The new name is {new_path}.") + From bfb1d468c4e5f4e019591d1f24cbaa0508fa558e Mon Sep 17 00:00:00 2001 From: jenna-tomkinson Date: Mon, 17 Jul 2023 13:14:38 -0600 Subject: [PATCH 2/5] add plate 4 and edit notebooks --- .../0.merge_sc_cytotable.ipynb | 382 +++++++++++++-- 3.processing_features/1.annotate_sc.ipynb | 436 +++++++++--------- 3.processing_features/2.normalize_sc.ipynb | 381 ++++++++------- .../3.feature_select_sc.ipynb | 423 ++++++++--------- .../scripts/0.merge_sc_cytotable.py | 81 ++-- .../scripts/1.annotate_sc.py | 60 ++- .../scripts/2.normalize_sc.py | 20 +- .../scripts/3.feature_select_sc.py | 13 +- 8 files changed, 1054 insertions(+), 742 deletions(-) diff --git a/3.processing_features/0.merge_sc_cytotable.ipynb b/3.processing_features/0.merge_sc_cytotable.ipynb index d17ff19..303e148 100644 --- a/3.processing_features/0.merge_sc_cytotable.ipynb +++ b/3.processing_features/0.merge_sc_cytotable.ipynb @@ -25,6 +25,8 @@ "import sys\n", "import pathlib\n", "import yaml\n", + "import pprint\n", + "import pandas as pd\n", "\n", "# cytotable will merge objects from SQLite file into single cells and save as parquet file\n", "from cytotable import convert, presets\n", @@ -50,6 +52,20 @@ "# type of file output from CytoTable (currently only parquet)\n", "dest_datatype = \"parquet\"\n", "\n", + "# set main output dir for all parquet files\n", + "output_dir = pathlib.Path(\"./data/converted_data/\")\n", + "output_dir.mkdir(exist_ok=True)\n", + "\n", + "# directory where SQLite files are located\n", + "sqlite_dir = pathlib.Path(\"../2.cellprofiler_analysis/analysis_output/\")\n", + "\n", + "# list for plate names based on folders to use to create dictionary\n", + "plate_names = []\n", + "# iterate through 0.download_data and append plate names from folder names that contain image data from that plate\n", + "for file_path in pathlib.Path(\"../0.download_data/\").iterdir():\n", + " if str(file_path.stem).startswith(\"Plate\"):\n", + " plate_names.append(str(file_path.stem))\n", + "\n", "# preset configurations based on typical CellProfiler outputs\n", "preset = \"cellprofiler_sqlite_pycytominer\"\n", "# remove Image_Metadata_Plate from SELECT as this metadata was not extracted from file names\n", @@ -82,54 +98,51 @@ " \"\"\"" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create dictionary with info for each plate\n", + "\n", + "**Note:** All paths must be string to use with CytoTable." + ] + }, { "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{ 'Plate_1': { 'dest_path': 'data/converted_data/Plate_1.parquet',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},\n", + " 'Plate_2': { 'dest_path': 'data/converted_data/Plate_2.parquet',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},\n", + " 'Plate_3': { 'dest_path': 'data/converted_data/Plate_3.parquet',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},\n", + " 'Plate_3_prime': { 'dest_path': 'data/converted_data/Plate_3_prime.parquet',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},\n", + " 'Plate_4': { 'dest_path': 'data/converted_data/Plate_4.parquet',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite'}}\n" + ] + } + ], "source": [ - "# dictionary with info for the sqlite file from each plate\n", + "# create plate info dictionary with all parts of the CellProfiler CLI command to run in parallel\n", "plate_info_dictionary = {\n", - " \"Plate_1\": {\n", - " # path to outputted SQLite file\n", - " \"source_path\": str(\n", - " pathlib.Path(\n", - " \"../2.cellprofiler_analysis/analysis_output/Plate_1.sqlite\"\n", - " )\n", - " ),\n", - " \"dest_path\": str(pathlib.Path(f\"./data/converted_data/Plate_1.parquet\")),\n", - " },\n", - " \"Plate_2\": {\n", - " # path to outputted SQLite file\n", - " \"source_path\": str(\n", - " pathlib.Path(\n", - " \"../2.cellprofiler_analysis/analysis_output/Plate_2.sqlite\"\n", - " )\n", - " ),\n", - " # path for merged single cell paraquet file (without annotations)\n", - " \"dest_path\": str(pathlib.Path(f\"./data/converted_data/Plate_2.parquet\")),\n", - " },\n", - " \"Plate_3\": {\n", - " # path to outputted SQLite file\n", - " \"source_path\": str(\n", - " pathlib.Path(\n", - " \"../2.cellprofiler_analysis/analysis_output/Plate_3.sqlite\"\n", - " )\n", - " ),\n", - " # path for merged single cell paraquet file (without annotations)\n", - " \"dest_path\": str(pathlib.Path(f\"./data/converted_data/Plate_3.parquet\")),\n", - " },\n", - " \"Plate_3_prime\": {\n", - " # path to outputted SQLite file\n", - " \"source_path\": str(\n", - " pathlib.Path(\n", - " \"../2.cellprofiler_analysis/analysis_output/Plate_3_prime.sqlite\"\n", - " )\n", - " ),\n", - " # path for merged single cell paraquet file (without annotations)\n", - " \"dest_path\": str(pathlib.Path(f\"./data/converted_data/Plate_3_prime.parquet\")),\n", + " name: {\n", + " \"source_path\": str(pathlib.Path(\n", + " list(sqlite_dir.rglob(f\"{name}_nf1_analysis.sqlite\"))[0]\n", + " ).resolve(strict=True)),\n", + " \"dest_path\": str(pathlib.Path(f\"{output_dir}/{name}.parquet\")),\n", " }\n", - "}" + " for name in plate_names\n", + "}\n", + "\n", + "# view the dictionary to assess that all info is added correctly\n", + "pprint.pprint(plate_info_dictionary, indent=4)" ] }, { @@ -149,15 +162,18 @@ "name": "stdout", "output_type": "stream", "text": [ + "Performing merge single cells and conversion on Plate_4!\n", + "Merged and converted Plate_4.parquet!\n", + "Added single cell count as metadata to Plate_4.parquet!\n", "Performing merge single cells and conversion on Plate_1!\n", "Merged and converted Plate_1.parquet!\n", "Added single cell count as metadata to Plate_1.parquet!\n", - "Performing merge single cells and conversion on Plate_2!\n", - "Merged and converted Plate_2.parquet!\n", - "Added single cell count as metadata to Plate_2.parquet!\n", "Performing merge single cells and conversion on Plate_3!\n", "Merged and converted Plate_3.parquet!\n", "Added single cell count as metadata to Plate_3.parquet!\n", + "Performing merge single cells and conversion on Plate_2!\n", + "Merged and converted Plate_2.parquet!\n", + "Added single cell count as metadata to Plate_2.parquet!\n", "Performing merge single cells and conversion on Plate_3_prime!\n", "Merged and converted Plate_3_prime.parquet!\n", "Added single cell count as metadata to Plate_3_prime.parquet!\n" @@ -187,6 +203,282 @@ " print(f\"Added single cell count as metadata to {pathlib.Path(dest_path).name}!\")" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(7502, 1592)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Metadata_ImageNumberImage_Metadata_SiteMetadata_number_of_singlecellsImage_Metadata_WellMetadata_Cells_Number_Object_NumberMetadata_Cytoplasm_Parent_CellsMetadata_Cytoplasm_Parent_NucleiMetadata_Nuclei_Number_Object_NumberCytoplasm_AreaShape_AreaCytoplasm_AreaShape_BoundingBoxArea...Nuclei_Texture_Variance_DAPI_3_02_256Nuclei_Texture_Variance_DAPI_3_03_256Nuclei_Texture_Variance_GFP_3_00_256Nuclei_Texture_Variance_GFP_3_01_256Nuclei_Texture_Variance_GFP_3_02_256Nuclei_Texture_Variance_GFP_3_03_256Nuclei_Texture_Variance_RFP_3_00_256Nuclei_Texture_Variance_RFP_3_01_256Nuclei_Texture_Variance_RFP_3_02_256Nuclei_Texture_Variance_RFP_3_03_256
011081B10445522157.055566.0...1281.8741861257.43576165.96569552.06822250.44578051.851812425.319446409.351012418.021018425.068520
111081B10556611718.033891.0...1085.7504601113.144205139.037112140.802921141.819546149.091779512.879573499.756267513.447060507.419635
221181B10111117501.052866.0...1273.4287211246.970723137.466776111.514400113.076080118.810204311.232220306.768555302.693276298.429977
321181B10222217871.088250.0...633.124457642.170387190.690537173.126428170.503677178.200219401.039364412.623493420.041994402.738604
421181B10333312098.042926.0...894.732816829.273862142.997128131.232052126.981214128.412295357.660331351.831903351.386580357.795596
\n", + "

5 rows × 1592 columns

\n", + "
" + ], + "text/plain": [ + " Metadata_ImageNumber Image_Metadata_Site Metadata_number_of_singlecells \n", + "0 1 10 81 \\\n", + "1 1 10 81 \n", + "2 2 11 81 \n", + "3 2 11 81 \n", + "4 2 11 81 \n", + "\n", + " Image_Metadata_Well Metadata_Cells_Number_Object_Number \n", + "0 B10 4 \\\n", + "1 B10 5 \n", + "2 B10 1 \n", + "3 B10 2 \n", + "4 B10 3 \n", + "\n", + " Metadata_Cytoplasm_Parent_Cells Metadata_Cytoplasm_Parent_Nuclei \n", + "0 4 5 \\\n", + "1 5 6 \n", + "2 1 1 \n", + "3 2 2 \n", + "4 3 3 \n", + "\n", + " Metadata_Nuclei_Number_Object_Number Cytoplasm_AreaShape_Area \n", + "0 5 22157.0 \\\n", + "1 6 11718.0 \n", + "2 1 17501.0 \n", + "3 2 17871.0 \n", + "4 3 12098.0 \n", + "\n", + " Cytoplasm_AreaShape_BoundingBoxArea ... \n", + "0 55566.0 ... \\\n", + "1 33891.0 ... \n", + "2 52866.0 ... \n", + "3 88250.0 ... \n", + "4 42926.0 ... \n", + "\n", + " Nuclei_Texture_Variance_DAPI_3_02_256 \n", + "0 1281.874186 \\\n", + "1 1085.750460 \n", + "2 1273.428721 \n", + "3 633.124457 \n", + "4 894.732816 \n", + "\n", + " Nuclei_Texture_Variance_DAPI_3_03_256 \n", + "0 1257.435761 \\\n", + "1 1113.144205 \n", + "2 1246.970723 \n", + "3 642.170387 \n", + "4 829.273862 \n", + "\n", + " Nuclei_Texture_Variance_GFP_3_00_256 Nuclei_Texture_Variance_GFP_3_01_256 \n", + "0 65.965695 52.068222 \\\n", + "1 139.037112 140.802921 \n", + "2 137.466776 111.514400 \n", + "3 190.690537 173.126428 \n", + "4 142.997128 131.232052 \n", + "\n", + " Nuclei_Texture_Variance_GFP_3_02_256 Nuclei_Texture_Variance_GFP_3_03_256 \n", + "0 50.445780 51.851812 \\\n", + "1 141.819546 149.091779 \n", + "2 113.076080 118.810204 \n", + "3 170.503677 178.200219 \n", + "4 126.981214 128.412295 \n", + "\n", + " Nuclei_Texture_Variance_RFP_3_00_256 Nuclei_Texture_Variance_RFP_3_01_256 \n", + "0 425.319446 409.351012 \\\n", + "1 512.879573 499.756267 \n", + "2 311.232220 306.768555 \n", + "3 401.039364 412.623493 \n", + "4 357.660331 351.831903 \n", + "\n", + " Nuclei_Texture_Variance_RFP_3_02_256 Nuclei_Texture_Variance_RFP_3_03_256 \n", + "0 418.021018 425.068520 \n", + "1 513.447060 507.419635 \n", + "2 302.693276 298.429977 \n", + "3 420.041994 402.738604 \n", + "4 351.386580 357.795596 \n", + "\n", + "[5 rows x 1592 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "converted_df = pd.read_parquet(plate_info_dictionary[\"Plate_4\"][\"dest_path\"])\n", + "\n", + "# load in and print a converted df to see if it looks correct\n", + "print(converted_df.shape)\n", + "converted_df.head()" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -197,7 +489,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ diff --git a/3.processing_features/1.annotate_sc.ipynb b/3.processing_features/1.annotate_sc.ipynb index 44007a9..ce2f2c2 100644 --- a/3.processing_features/1.annotate_sc.ipynb +++ b/3.processing_features/1.annotate_sc.ipynb @@ -24,9 +24,8 @@ "source": [ "import sys\n", "import pathlib\n", - "import os\n", "import yaml\n", - "import json\n", + "import pprint\n", "\n", "import pandas as pd\n", "from pycytominer import annotate\n", @@ -48,64 +47,72 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, + "outputs": [], + "source": [ + "# output directory for annotated data\n", + "output_dir = pathlib.Path(\"./data/annotated_data\")\n", + "output_dir.mkdir(exist_ok=True)\n", + "\n", + "# directory with metadata\n", + "metadata_dir = pathlib.Path(\"../0.download_data/metadata/\")\n", + "\n", + "# load in dicionary from yaml file\n", + "dictionary_path = pathlib.Path(\"./plate_info_dictionary.yaml\")\n", + "with open(dictionary_path) as file:\n", + " plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add metadata paths to loaded in dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{\n", - " \"Plate_1\": {\n", - " \"annotated_path\": \"data/annotated_data/Plate_1_sc.parquet\",\n", - " \"dest_path\": \"data/converted_data/Plate_1.parquet\",\n", - " \"normalized_path\": \"data/normalized_data/Plate_1_sc_norm.parquet\",\n", - " \"platemap_path\": \"../0.download_data/metadata/platemap_NF1_plate1.csv\",\n", - " \"source_path\": \"../2.cellprofiler_analysis/analysis_output/Plate_1.sqlite\"\n", - " },\n", - " \"Plate_2\": {\n", - " \"annotated_path\": \"data/annotated_data/Plate_2_sc.parquet\",\n", - " \"dest_path\": \"data/converted_data/Plate_2.parquet\",\n", - " \"normalized_path\": \"data/normalized_data/Plate_2_sc_norm.parquet\",\n", - " \"platemap_path\": \"../0.download_data/metadata/platemap_NF1_plate2.csv\",\n", - " \"source_path\": \"../2.cellprofiler_analysis/analysis_output/Plate_2.sqlite\"\n", - " },\n", - " \"Plate_3\": {\n", - " \"annotated_path\": \"data/annotated_data/Plate_3_sc.parquet\",\n", - " \"dest_path\": \"data/converted_data/Plate_3.parquet\",\n", - " \"normalized_path\": \"data/normalized_data/Plate_3_sc_norm.parquet\",\n", - " \"platemap_path\": \"../0.download_data/metadata/platemap_NF1_plate3.csv\",\n", - " \"source_path\": \"../2.cellprofiler_analysis/analysis_output/Plate_3.sqlite\"\n", - " },\n", - " \"Plate_3_prime\": {\n", - " \"annotated_path\": \"data/annotated_data/Plate_3_prime_sc.parquet\",\n", - " \"dest_path\": \"data/converted_data/Plate_3_prime.parquet\",\n", - " \"normalized_path\": \"data/normalized_data/Plate_3_prime_sc_norm.parquet\",\n", - " \"platemap_path\": \"../0.download_data/metadata/platemap_NF1_plate3.csv\",\n", - " \"source_path\": \"../2.cellprofiler_analysis/analysis_output/Plate_3_prime.sqlite\"\n", - " }\n", - "}\n" + "{ 'Plate_1': { 'dest_path': 'data/converted_data/Plate_1.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate1.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},\n", + " 'Plate_2': { 'dest_path': 'data/converted_data/Plate_2.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate2.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},\n", + " 'Plate_3': { 'dest_path': 'data/converted_data/Plate_3.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},\n", + " 'Plate_3_prime': { 'dest_path': 'data/converted_data/Plate_3_prime.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},\n", + " 'Plate_4': { 'dest_path': 'data/converted_data/Plate_4.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate4.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite'}}\n" ] } ], "source": [ - "# output directory for annotated data\n", - "output_dir = pathlib.Path(\"./data/annotated_data\")\n", - "# if directory if doesn't exist, will not raise error if it already exists\n", - "os.makedirs(output_dir, exist_ok=True)\n", - "\n", - "# load in dicionary from yaml file\n", - "dictionary_path = pathlib.Path(\"./plate_info_dictionary.yaml\")\n", - "with open(dictionary_path) as file:\n", - " plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)\n", - "\n", - "# add paths to dictionary that are used for annotation\n", - "plate_info_dictionary[\"Plate_1\"][\"platemap_path\"] = str(pathlib.Path(\"../0.download_data/metadata/platemap_NF1_plate1.csv\"))\n", - "plate_info_dictionary[\"Plate_2\"][\"platemap_path\"] = str(pathlib.Path(\"../0.download_data/metadata/platemap_NF1_plate2.csv\"))\n", - "# both plates 3 and 3 prime use the same platemap file (same metadata)\n", - "plate_info_dictionary[\"Plate_3\"][\"platemap_path\"] = str(pathlib.Path(\"../0.download_data/metadata/platemap_NF1_plate3.csv\"))\n", - "plate_info_dictionary[\"Plate_3_prime\"][\"platemap_path\"] = str(pathlib.Path(\"../0.download_data/metadata/platemap_NF1_plate3.csv\"))\n", + "# add path to platemaps for each plate \n", + "for plate, _ in plate_info_dictionary.items():\n", + " # since Plate_3_prime has the same platemap as Plate_3, we need an else statement so that we make sure it adds the \n", + " # path that was given to Plate_3\n", + " if plate != \"Plate_3_prime\":\n", + " # match the naming format of the plates to the platemap file\n", + " plate_info_dictionary[plate][\"platemap_path\"] = str(\n", + " pathlib.Path(list(metadata_dir.rglob(f\"platemap_NF1_{plate.replace('_', '').lower()}.csv\"))[0]).resolve(\n", + " strict=True\n", + " )\n", + " )\n", + " else:\n", + " plate_info_dictionary[\"Plate_3_prime\"][\"platemap_path\"] = plate_info_dictionary[\"Plate_3\"][\"platemap_path\"]\n", "\n", "# view the dictionary to assess that all info is added correctly\n", - "print(json.dumps(plate_info_dictionary, indent=4))" + "pprint.pprint(plate_info_dictionary, indent=4)" ] }, { @@ -113,12 +120,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Annotate merged single cells" + "## Annotate merged single cells\n", + "\n", + "**Note:** The path to the annotated file to be used for normalization is adding during this step." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -132,7 +141,9 @@ "Adding annotations to merged single cells for Plate_3!\n", "Annotations have been added to Plate_3 and saved!\n", "Adding annotations to merged single cells for Plate_3_prime!\n", - "Annotations have been added to Plate_3_prime and saved!\n" + "Annotations have been added to Plate_3_prime and saved!\n", + "Adding annotations to merged single cells for Plate_4!\n", + "Annotations have been added to Plate_4 and saved!\n" ] } ], @@ -153,12 +164,18 @@ " join_on=[\"Metadata_well_position\", \"Image_Metadata_Well\"],\n", " )\n", "\n", - " # move metadata well and single cell count to the front of the df (for easy visualization in python)\n", + " # rename site column to avoid any issues with identifying the column as metadata over feature\n", + " annotated_df = annotated_df.rename(columns={\"Image_Metadata_Site\": \"Metadata_Site\"})\n", + "\n", + " # move metadata well, single cell count, and site to the front of the df (for easy visualization in python)\n", " well_column = annotated_df.pop(\"Metadata_Well\")\n", " singlecell_column = annotated_df.pop(\"Metadata_number_of_singlecells\")\n", - " # insert the column as the second index column in the dataframe\n", - " annotated_df.insert(1, \"Metadata_Well\", well_column)\n", - " annotated_df.insert(2, \"Metadata_number_of_singlecells\", singlecell_column)\n", + " site_column = annotated_df.pop(\"Metadata_Site\") \n", + "\n", + " # insert the columns in specific parts of the dataframe\n", + " annotated_df.insert(2, \"Metadata_Well\", well_column)\n", + " annotated_df.insert(3, \"Metadata_Site\", site_column)\n", + " annotated_df.insert(4, \"Metadata_number_of_singlecells\", singlecell_column)\n", "\n", " # save annotated df as parquet file\n", " output(\n", @@ -171,14 +188,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(14495, 1597)\n" + "(7502, 1600)\n" ] }, { @@ -203,15 +220,15 @@ " \n", " \n", " Metadata_WellRow\n", + " Metadata_WellCol\n", " Metadata_Well\n", + " Metadata_Site\n", " Metadata_number_of_singlecells\n", - " Metadata_WellCol\n", " Metadata_gene_name\n", " Metadata_genotype\n", " Metadata_seed_density\n", - " Metadata_ImageNumber\n", - " Metadata_Cells_Number_Object_Number\n", - " Metadata_Cytoplasm_Parent_Cells\n", + " Metadata_siRNA\n", + " Metadata_RNAiMax\n", " ...\n", " Nuclei_Texture_Variance_DAPI_3_02_256\n", " Nuclei_Texture_Variance_DAPI_3_03_256\n", @@ -229,203 +246,196 @@ " \n", " 0\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 11\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 78\n", - " 1\n", - " 1\n", + " 1000\n", + " NaN\n", + " 0\n", " ...\n", - " 2278.660622\n", - " 2216.518209\n", - " 478.722598\n", - " 449.485799\n", - " 464.175505\n", - " 475.434688\n", - " 74.009363\n", - " 72.723123\n", - " 76.103825\n", - " 76.127622\n", + " 886.555259\n", + " 887.126996\n", + " 220.059165\n", + " 178.494944\n", + " 177.175567\n", + " 178.536364\n", + " 1947.689117\n", + " 1942.505237\n", + " 1929.883349\n", + " 1954.302782\n", " \n", " \n", " 1\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 11\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 81\n", - " 2\n", - " 2\n", + " 1000\n", + " NaN\n", + " 0\n", " ...\n", - " 720.731162\n", - " 722.051788\n", - " 121.540870\n", - " 129.189742\n", - " 124.312256\n", - " 123.659245\n", - " 194.556687\n", - " 194.525087\n", - " 188.947644\n", - " 189.180229\n", + " 2269.739602\n", + " 2183.665041\n", + " 197.803849\n", + " 181.889009\n", + " 190.645422\n", + " 197.228087\n", + " 224.801646\n", + " 219.118772\n", + " 235.706020\n", + " 228.832375\n", " \n", " \n", " 2\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 11\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 82\n", - " 1\n", - " 1\n", + " 1000\n", + " NaN\n", + " 0\n", " ...\n", - " 2464.118189\n", - " 2568.315137\n", - " 591.215759\n", - " 470.246357\n", - " 453.591037\n", - " 455.722645\n", - " 161.361597\n", - " 148.654973\n", - " 145.224151\n", - " 148.543595\n", + " 993.492352\n", + " 951.907468\n", + " 424.891364\n", + " 390.009779\n", + " 381.521734\n", + " 390.667328\n", + " 623.836472\n", + " 616.211140\n", + " 661.626364\n", + " 636.520133\n", " \n", " \n", " 3\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 11\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 82\n", - " 2\n", - " 2\n", + " 1000\n", + " NaN\n", + " 0\n", " ...\n", - " 2886.052228\n", - " 3117.874708\n", - " 572.970287\n", - " 539.569923\n", - " 537.033409\n", - " 476.209479\n", - " 162.939002\n", - " 160.198123\n", - " 164.586236\n", - " 155.469083\n", + " 2636.424708\n", + " 2517.017120\n", + " 561.624256\n", + " 544.408328\n", + " 548.657987\n", + " 556.701996\n", + " 364.792509\n", + " 365.727202\n", + " 437.796494\n", + " 380.910214\n", " \n", " \n", " 4\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 14\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 83\n", - " 1\n", - " 1\n", + " 1000\n", + " NaN\n", + " 0\n", " ...\n", - " 725.579676\n", - " 707.478879\n", - " 305.197153\n", - " 305.705155\n", - " 294.205789\n", - " 301.477588\n", - " 97.254067\n", - " 95.867598\n", - " 95.624585\n", - " 95.350038\n", + " 652.339728\n", + " 635.869580\n", + " 126.833034\n", + " 120.283432\n", + " 109.813358\n", + " 109.301294\n", + " 299.201582\n", + " 297.223402\n", + " 320.864494\n", + " 303.344477\n", " \n", " \n", "\n", - "

5 rows × 1597 columns

\n", + "

5 rows × 1600 columns

\n", "" ], "text/plain": [ - " Metadata_WellRow Metadata_Well Metadata_number_of_singlecells \\\n", - "0 B B1 42 \n", - "1 B B1 42 \n", - "2 B B1 42 \n", - "3 B B1 42 \n", - "4 B B1 42 \n", - "\n", - " Metadata_WellCol Metadata_gene_name Metadata_genotype \\\n", - "0 1 NF1 WT \n", - "1 1 NF1 WT \n", - "2 1 NF1 WT \n", - "3 1 NF1 WT \n", - "4 1 NF1 WT \n", + " Metadata_WellRow Metadata_WellCol Metadata_Well Metadata_Site \n", + "0 B 2 B2 11 \\\n", + "1 B 2 B2 11 \n", + "2 B 2 B2 11 \n", + "3 B 2 B2 11 \n", + "4 B 2 B2 14 \n", "\n", - " Metadata_seed_density Metadata_ImageNumber \\\n", - "0 500 78 \n", - "1 500 81 \n", - "2 500 82 \n", - "3 500 82 \n", - "4 500 83 \n", + " Metadata_number_of_singlecells Metadata_gene_name Metadata_genotype \n", + "0 115 NF1 WT \\\n", + "1 115 NF1 WT \n", + "2 115 NF1 WT \n", + "3 115 NF1 WT \n", + "4 115 NF1 WT \n", "\n", - " Metadata_Cells_Number_Object_Number Metadata_Cytoplasm_Parent_Cells ... \\\n", - "0 1 1 ... \n", - "1 2 2 ... \n", - "2 1 1 ... \n", - "3 2 2 ... \n", - "4 1 1 ... \n", + " Metadata_seed_density Metadata_siRNA Metadata_RNAiMax ... \n", + "0 1000 NaN 0 ... \\\n", + "1 1000 NaN 0 ... \n", + "2 1000 NaN 0 ... \n", + "3 1000 NaN 0 ... \n", + "4 1000 NaN 0 ... \n", "\n", - " Nuclei_Texture_Variance_DAPI_3_02_256 \\\n", - "0 2278.660622 \n", - "1 720.731162 \n", - "2 2464.118189 \n", - "3 2886.052228 \n", - "4 725.579676 \n", + " Nuclei_Texture_Variance_DAPI_3_02_256 \n", + "0 886.555259 \\\n", + "1 2269.739602 \n", + "2 993.492352 \n", + "3 2636.424708 \n", + "4 652.339728 \n", "\n", - " Nuclei_Texture_Variance_DAPI_3_03_256 Nuclei_Texture_Variance_GFP_3_00_256 \\\n", - "0 2216.518209 478.722598 \n", - "1 722.051788 121.540870 \n", - "2 2568.315137 591.215759 \n", - "3 3117.874708 572.970287 \n", - "4 707.478879 305.197153 \n", + " Nuclei_Texture_Variance_DAPI_3_03_256 \n", + "0 887.126996 \\\n", + "1 2183.665041 \n", + "2 951.907468 \n", + "3 2517.017120 \n", + "4 635.869580 \n", "\n", - " Nuclei_Texture_Variance_GFP_3_01_256 Nuclei_Texture_Variance_GFP_3_02_256 \\\n", - "0 449.485799 464.175505 \n", - "1 129.189742 124.312256 \n", - "2 470.246357 453.591037 \n", - "3 539.569923 537.033409 \n", - "4 305.705155 294.205789 \n", + " Nuclei_Texture_Variance_GFP_3_00_256 Nuclei_Texture_Variance_GFP_3_01_256 \n", + "0 220.059165 178.494944 \\\n", + "1 197.803849 181.889009 \n", + "2 424.891364 390.009779 \n", + "3 561.624256 544.408328 \n", + "4 126.833034 120.283432 \n", "\n", - " Nuclei_Texture_Variance_GFP_3_03_256 Nuclei_Texture_Variance_RFP_3_00_256 \\\n", - "0 475.434688 74.009363 \n", - "1 123.659245 194.556687 \n", - "2 455.722645 161.361597 \n", - "3 476.209479 162.939002 \n", - "4 301.477588 97.254067 \n", + " Nuclei_Texture_Variance_GFP_3_02_256 Nuclei_Texture_Variance_GFP_3_03_256 \n", + "0 177.175567 178.536364 \\\n", + "1 190.645422 197.228087 \n", + "2 381.521734 390.667328 \n", + "3 548.657987 556.701996 \n", + "4 109.813358 109.301294 \n", "\n", - " Nuclei_Texture_Variance_RFP_3_01_256 Nuclei_Texture_Variance_RFP_3_02_256 \\\n", - "0 72.723123 76.103825 \n", - "1 194.525087 188.947644 \n", - "2 148.654973 145.224151 \n", - "3 160.198123 164.586236 \n", - "4 95.867598 95.624585 \n", + " Nuclei_Texture_Variance_RFP_3_00_256 Nuclei_Texture_Variance_RFP_3_01_256 \n", + "0 1947.689117 1942.505237 \\\n", + "1 224.801646 219.118772 \n", + "2 623.836472 616.211140 \n", + "3 364.792509 365.727202 \n", + "4 299.201582 297.223402 \n", "\n", - " Nuclei_Texture_Variance_RFP_3_03_256 \n", - "0 76.127622 \n", - "1 189.180229 \n", - "2 148.543595 \n", - "3 155.469083 \n", - "4 95.350038 \n", + " Nuclei_Texture_Variance_RFP_3_02_256 Nuclei_Texture_Variance_RFP_3_03_256 \n", + "0 1929.883349 1954.302782 \n", + "1 235.706020 228.832375 \n", + "2 661.626364 636.520133 \n", + "3 437.796494 380.910214 \n", + "4 320.864494 303.344477 \n", "\n", - "[5 rows x 1597 columns]" + "[5 rows x 1600 columns]" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -446,11 +456,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "with open(dictionary_path, 'w') as file:\n", + "with open(dictionary_path, \"w\") as file:\n", " yaml.dump(plate_info_dictionary, file)" ] } diff --git a/3.processing_features/2.normalize_sc.ipynb b/3.processing_features/2.normalize_sc.ipynb index caa752f..bdbba51 100644 --- a/3.processing_features/2.normalize_sc.ipynb +++ b/3.processing_features/2.normalize_sc.ipynb @@ -22,17 +22,22 @@ "metadata": {}, "outputs": [], "source": [ - "import sys\n", "import pathlib\n", - "import os\n", "import yaml\n", - "import json\n", + "import pprint\n", "\n", "import pandas as pd\n", "from pycytominer import normalize\n", "from pycytominer.cyto_utils import output" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set paths and load in dictionary from annotated run" + ] + }, { "cell_type": "code", "execution_count": 2, @@ -42,52 +47,50 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\n", - " \"Plate_1\": {\n", - " \"annotated_path\": \"data/annotated_data/Plate_1_sc.parquet\",\n", - " \"dest_path\": \"data/converted_data/Plate_1.parquet\",\n", - " \"normalized_path\": \"data/normalized_data/Plate_1_sc_norm.parquet\",\n", - " \"platemap_path\": \"../0.download_data/metadata/platemap_NF1_plate1.csv\",\n", - " \"source_path\": \"../2.cellprofiler_analysis/analysis_output/Plate_1.sqlite\"\n", - " },\n", - " \"Plate_2\": {\n", - " \"annotated_path\": \"data/annotated_data/Plate_2_sc.parquet\",\n", - " \"dest_path\": \"data/converted_data/Plate_2.parquet\",\n", - " \"normalized_path\": \"data/normalized_data/Plate_2_sc_norm.parquet\",\n", - " \"platemap_path\": \"../0.download_data/metadata/platemap_NF1_plate2.csv\",\n", - " \"source_path\": \"../2.cellprofiler_analysis/analysis_output/Plate_2.sqlite\"\n", - " },\n", - " \"Plate_3\": {\n", - " \"annotated_path\": \"data/annotated_data/Plate_3_sc.parquet\",\n", - " \"dest_path\": \"data/converted_data/Plate_3.parquet\",\n", - " \"normalized_path\": \"data/normalized_data/Plate_3_sc_norm.parquet\",\n", - " \"platemap_path\": \"../0.download_data/metadata/platemap_NF1_plate3.csv\",\n", - " \"source_path\": \"../2.cellprofiler_analysis/analysis_output/Plate_3.sqlite\"\n", - " },\n", - " \"Plate_3_prime\": {\n", - " \"annotated_path\": \"data/annotated_data/Plate_3_prime_sc.parquet\",\n", - " \"dest_path\": \"data/converted_data/Plate_3_prime.parquet\",\n", - " \"normalized_path\": \"data/normalized_data/Plate_3_prime_sc_norm.parquet\",\n", - " \"platemap_path\": \"../0.download_data/metadata/platemap_NF1_plate3.csv\",\n", - " \"source_path\": \"../2.cellprofiler_analysis/analysis_output/Plate_3_prime.sqlite\"\n", - " }\n", - "}\n" + "{'Plate_1': {'annotated_path': 'data/annotated_data/Plate_1_sc.parquet',\n", + " 'dest_path': 'data/converted_data/Plate_1.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate1.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},\n", + " 'Plate_2': {'annotated_path': 'data/annotated_data/Plate_2_sc.parquet',\n", + " 'dest_path': 'data/converted_data/Plate_2.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate2.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},\n", + " 'Plate_3': {'annotated_path': 'data/annotated_data/Plate_3_sc.parquet',\n", + " 'dest_path': 'data/converted_data/Plate_3.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},\n", + " 'Plate_3_prime': {'annotated_path': 'data/annotated_data/Plate_3_prime_sc.parquet',\n", + " 'dest_path': 'data/converted_data/Plate_3_prime.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},\n", + " 'Plate_4': {'annotated_path': 'data/annotated_data/Plate_4_sc.parquet',\n", + " 'dest_path': 'data/converted_data/Plate_4.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate4.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite'}}\n" ] } ], "source": [ "# output directory for normalized data\n", "output_dir = pathlib.Path(\"./data/normalized_data\")\n", - "# if directory if doesn't exist, will not raise error if it already exists\n", - "os.makedirs(output_dir, exist_ok=True)\n", + "output_dir.mkdir(exist_ok=True)\n", "\n", "# load in dicionary from yaml file\n", "dictionary_path = pathlib.Path(\"./plate_info_dictionary.yaml\")\n", "with open(dictionary_path) as file:\n", " plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)\n", "\n", - "# view the dictionary to confirm all info is included to use for normalization\n", - "print(json.dumps(plate_info_dictionary, indent=4))" + "# view the dictionary to assess that all info is added correctly\n", + "pprint.pprint(plate_info_dictionary)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Normalize annotated single cells from each plate\n", + "\n", + "**Note:** Path to normalized data for each plate is added to the dictionary in this step to be used during feature selection." ] }, { @@ -106,7 +109,9 @@ "Normalizing annotated merged single cells for Plate_3!\n", "Single cells have been normalized for Plate_3 and saved!\n", "Normalizing annotated merged single cells for Plate_3_prime!\n", - "Single cells have been normalized for Plate_3_prime and saved!\n" + "Single cells have been normalized for Plate_3_prime and saved!\n", + "Normalizing annotated merged single cells for Plate_4!\n", + "Single cells have been normalized for Plate_4 and saved!\n" ] } ], @@ -114,8 +119,9 @@ "# process each run\n", "for plate, info in plate_info_dictionary.items():\n", " annotated_df = pd.read_parquet(info[\"annotated_path\"])\n", + " # set output path and add to the dictionary\n", " output_file = str(pathlib.Path(f\"{output_dir}/{plate}_sc_norm.parquet\"))\n", - " # save path to annotated file to dictionary for downstream use\n", + " # save path to normalized file to dictionary for downstream use\n", " plate_info_dictionary[plate][\"normalized_path\"] = output_file\n", " print(f\"Normalizing annotated merged single cells for {plate}!\")\n", "\n", @@ -145,7 +151,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(14495, 1596)\n" + "(7502, 1600)\n" ] }, { @@ -170,15 +176,15 @@ " \n", " \n", " Metadata_WellRow\n", + " Metadata_WellCol\n", " Metadata_Well\n", + " Metadata_Site\n", " Metadata_number_of_singlecells\n", - " Metadata_WellCol\n", " Metadata_gene_name\n", " Metadata_genotype\n", " Metadata_seed_density\n", - " Metadata_ImageNumber\n", - " Metadata_Cells_Number_Object_Number\n", - " Metadata_Cytoplasm_Parent_Cells\n", + " Metadata_siRNA\n", + " Metadata_RNAiMax\n", " ...\n", " Nuclei_Texture_Variance_DAPI_3_02_256\n", " Nuclei_Texture_Variance_DAPI_3_03_256\n", @@ -196,200 +202,193 @@ " \n", " 0\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 11\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 78\n", - " 1\n", - " 1\n", + " 1000\n", + " None\n", + " 0\n", " ...\n", - " 1.256030\n", - " 1.227913\n", - " 0.531897\n", - " 0.487729\n", - " 0.498280\n", - " 0.547909\n", - " -0.751284\n", - " -0.750040\n", - " -0.746242\n", - " -0.742364\n", + " -1.015567\n", + " -0.991013\n", + " -0.427085\n", + " -0.492375\n", + " -0.505686\n", + " -0.492483\n", + " 3.287944\n", + " 3.324396\n", + " 3.234285\n", + " 3.337852\n", " \n", " \n", " 1\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 11\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 81\n", - " 2\n", - " 2\n", + " 1000\n", + " None\n", + " 0\n", " ...\n", - " -0.649737\n", - " -0.635740\n", - " -0.267391\n", - " -0.243767\n", - " -0.262017\n", - " -0.255550\n", - " -0.500182\n", - " -0.494528\n", - " -0.511217\n", - " -0.505461\n", + " 1.254595\n", + " 1.181604\n", + " -0.467826\n", + " -0.486035\n", + " -0.481063\n", + " -0.457716\n", + " -0.661907\n", + " -0.668559\n", + " -0.634783\n", + " -0.644279\n", " \n", " \n", " 2\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 11\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 82\n", - " 1\n", - " 1\n", + " 1000\n", + " None\n", + " 0\n", " ...\n", - " 1.482894\n", - " 1.666617\n", - " 0.783631\n", - " 0.535143\n", - " 0.474602\n", - " 0.502886\n", - " -0.569328\n", - " -0.590753\n", - " -0.602282\n", - " -0.590615\n", + " -0.840056\n", + " -0.882460\n", + " -0.052123\n", + " -0.097227\n", + " -0.132127\n", + " -0.097907\n", + " 0.252911\n", + " 0.251474\n", + " 0.337910\n", + " 0.296604\n", " \n", " \n", " 3\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 11\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 82\n", - " 2\n", - " 2\n", + " 1000\n", + " None\n", + " 0\n", " ...\n", - " 1.999033\n", - " 2.351937\n", - " 0.742802\n", - " 0.693465\n", - " 0.661268\n", - " 0.549678\n", - " -0.566042\n", - " -0.566538\n", - " -0.561955\n", - " -0.576103\n", + " 1.856420\n", + " 1.740204\n", + " 0.198178\n", + " 0.191218\n", + " 0.173410\n", + " 0.210927\n", + " -0.340967\n", + " -0.328878\n", + " -0.173260\n", + " -0.293306\n", " \n", " \n", " 4\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 14\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 83\n", - " 1\n", - " 1\n", + " 1000\n", + " None\n", + " 0\n", " ...\n", - " -0.643806\n", - " -0.653913\n", - " 0.143588\n", - " 0.159361\n", - " 0.118046\n", - " 0.150589\n", - " -0.702865\n", - " -0.701488\n", - " -0.705585\n", - " -0.702083\n", + " -1.399975\n", + " -1.412046\n", + " -0.597744\n", + " -0.601125\n", + " -0.628829\n", + " -0.621264\n", + " -0.491339\n", + " -0.487596\n", + " -0.440303\n", + " -0.472316\n", " \n", " \n", "\n", - "

5 rows × 1596 columns

\n", + "

5 rows × 1600 columns

\n", "" ], "text/plain": [ - " Metadata_WellRow Metadata_Well Metadata_number_of_singlecells \\\n", - "0 B B1 42 \n", - "1 B B1 42 \n", - "2 B B1 42 \n", - "3 B B1 42 \n", - "4 B B1 42 \n", - "\n", - " Metadata_WellCol Metadata_gene_name Metadata_genotype \\\n", - "0 1 NF1 WT \n", - "1 1 NF1 WT \n", - "2 1 NF1 WT \n", - "3 1 NF1 WT \n", - "4 1 NF1 WT \n", + " Metadata_WellRow Metadata_WellCol Metadata_Well Metadata_Site \n", + "0 B 2 B2 11 \\\n", + "1 B 2 B2 11 \n", + "2 B 2 B2 11 \n", + "3 B 2 B2 11 \n", + "4 B 2 B2 14 \n", "\n", - " Metadata_seed_density Metadata_ImageNumber \\\n", - "0 500 78 \n", - "1 500 81 \n", - "2 500 82 \n", - "3 500 82 \n", - "4 500 83 \n", + " Metadata_number_of_singlecells Metadata_gene_name Metadata_genotype \n", + "0 115 NF1 WT \\\n", + "1 115 NF1 WT \n", + "2 115 NF1 WT \n", + "3 115 NF1 WT \n", + "4 115 NF1 WT \n", "\n", - " Metadata_Cells_Number_Object_Number Metadata_Cytoplasm_Parent_Cells ... \\\n", - "0 1 1 ... \n", - "1 2 2 ... \n", - "2 1 1 ... \n", - "3 2 2 ... \n", - "4 1 1 ... \n", + " Metadata_seed_density Metadata_siRNA Metadata_RNAiMax ... \n", + "0 1000 None 0 ... \\\n", + "1 1000 None 0 ... \n", + "2 1000 None 0 ... \n", + "3 1000 None 0 ... \n", + "4 1000 None 0 ... \n", "\n", - " Nuclei_Texture_Variance_DAPI_3_02_256 \\\n", - "0 1.256030 \n", - "1 -0.649737 \n", - "2 1.482894 \n", - "3 1.999033 \n", - "4 -0.643806 \n", + " Nuclei_Texture_Variance_DAPI_3_02_256 \n", + "0 -1.015567 \\\n", + "1 1.254595 \n", + "2 -0.840056 \n", + "3 1.856420 \n", + "4 -1.399975 \n", "\n", - " Nuclei_Texture_Variance_DAPI_3_03_256 \\\n", - "0 1.227913 \n", - "1 -0.635740 \n", - "2 1.666617 \n", - "3 2.351937 \n", - "4 -0.653913 \n", + " Nuclei_Texture_Variance_DAPI_3_03_256 \n", + "0 -0.991013 \\\n", + "1 1.181604 \n", + "2 -0.882460 \n", + "3 1.740204 \n", + "4 -1.412046 \n", "\n", - " Nuclei_Texture_Variance_GFP_3_00_256 Nuclei_Texture_Variance_GFP_3_01_256 \\\n", - "0 0.531897 0.487729 \n", - "1 -0.267391 -0.243767 \n", - "2 0.783631 0.535143 \n", - "3 0.742802 0.693465 \n", - "4 0.143588 0.159361 \n", + " Nuclei_Texture_Variance_GFP_3_00_256 Nuclei_Texture_Variance_GFP_3_01_256 \n", + "0 -0.427085 -0.492375 \\\n", + "1 -0.467826 -0.486035 \n", + "2 -0.052123 -0.097227 \n", + "3 0.198178 0.191218 \n", + "4 -0.597744 -0.601125 \n", "\n", - " Nuclei_Texture_Variance_GFP_3_02_256 Nuclei_Texture_Variance_GFP_3_03_256 \\\n", - "0 0.498280 0.547909 \n", - "1 -0.262017 -0.255550 \n", - "2 0.474602 0.502886 \n", - "3 0.661268 0.549678 \n", - "4 0.118046 0.150589 \n", + " Nuclei_Texture_Variance_GFP_3_02_256 Nuclei_Texture_Variance_GFP_3_03_256 \n", + "0 -0.505686 -0.492483 \\\n", + "1 -0.481063 -0.457716 \n", + "2 -0.132127 -0.097907 \n", + "3 0.173410 0.210927 \n", + "4 -0.628829 -0.621264 \n", "\n", - " Nuclei_Texture_Variance_RFP_3_00_256 Nuclei_Texture_Variance_RFP_3_01_256 \\\n", - "0 -0.751284 -0.750040 \n", - "1 -0.500182 -0.494528 \n", - "2 -0.569328 -0.590753 \n", - "3 -0.566042 -0.566538 \n", - "4 -0.702865 -0.701488 \n", + " Nuclei_Texture_Variance_RFP_3_00_256 Nuclei_Texture_Variance_RFP_3_01_256 \n", + "0 3.287944 3.324396 \\\n", + "1 -0.661907 -0.668559 \n", + "2 0.252911 0.251474 \n", + "3 -0.340967 -0.328878 \n", + "4 -0.491339 -0.487596 \n", "\n", " Nuclei_Texture_Variance_RFP_3_02_256 Nuclei_Texture_Variance_RFP_3_03_256 \n", - "0 -0.746242 -0.742364 \n", - "1 -0.511217 -0.505461 \n", - "2 -0.602282 -0.590615 \n", - "3 -0.561955 -0.576103 \n", - "4 -0.705585 -0.702083 \n", + "0 3.234285 3.337852 \n", + "1 -0.634783 -0.644279 \n", + "2 0.337910 0.296604 \n", + "3 -0.173260 -0.293306 \n", + "4 -0.440303 -0.472316 \n", "\n", - "[5 rows x 1596 columns]" + "[5 rows x 1600 columns]" ] }, "execution_count": 4, diff --git a/3.processing_features/3.feature_select_sc.ipynb b/3.processing_features/3.feature_select_sc.ipynb index 80c0382..0046ba2 100644 --- a/3.processing_features/3.feature_select_sc.ipynb +++ b/3.processing_features/3.feature_select_sc.ipynb @@ -22,11 +22,9 @@ "metadata": {}, "outputs": [], "source": [ - "import sys\n", "import pathlib\n", - "import os\n", "import yaml\n", - "import json\n", + "import pprint\n", "\n", "import pandas as pd\n", "from pycytominer import feature_select\n", @@ -50,52 +48,46 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\n", - " \"Plate_1\": {\n", - " \"annotated_path\": \"data/annotated_data/Plate_1_sc.parquet\",\n", - " \"dest_path\": \"data/converted_data/Plate_1.parquet\",\n", - " \"normalized_path\": \"data/normalized_data/Plate_1_sc_norm.parquet\",\n", - " \"platemap_path\": \"../0.download_data/metadata/platemap_NF1_plate1.csv\",\n", - " \"source_path\": \"../2.cellprofiler_analysis/analysis_output/Plate_1.sqlite\"\n", - " },\n", - " \"Plate_2\": {\n", - " \"annotated_path\": \"data/annotated_data/Plate_2_sc.parquet\",\n", - " \"dest_path\": \"data/converted_data/Plate_2.parquet\",\n", - " \"normalized_path\": \"data/normalized_data/Plate_2_sc_norm.parquet\",\n", - " \"platemap_path\": \"../0.download_data/metadata/platemap_NF1_plate2.csv\",\n", - " \"source_path\": \"../2.cellprofiler_analysis/analysis_output/Plate_2.sqlite\"\n", - " },\n", - " \"Plate_3\": {\n", - " \"annotated_path\": \"data/annotated_data/Plate_3_sc.parquet\",\n", - " \"dest_path\": \"data/converted_data/Plate_3.parquet\",\n", - " \"normalized_path\": \"data/normalized_data/Plate_3_sc_norm.parquet\",\n", - " \"platemap_path\": \"../0.download_data/metadata/platemap_NF1_plate3.csv\",\n", - " \"source_path\": \"../2.cellprofiler_analysis/analysis_output/Plate_3.sqlite\"\n", - " },\n", - " \"Plate_3_prime\": {\n", - " \"annotated_path\": \"data/annotated_data/Plate_3_prime_sc.parquet\",\n", - " \"dest_path\": \"data/converted_data/Plate_3_prime.parquet\",\n", - " \"normalized_path\": \"data/normalized_data/Plate_3_prime_sc_norm.parquet\",\n", - " \"platemap_path\": \"../0.download_data/metadata/platemap_NF1_plate3.csv\",\n", - " \"source_path\": \"../2.cellprofiler_analysis/analysis_output/Plate_3_prime.sqlite\"\n", - " }\n", - "}\n" + "{'Plate_1': {'annotated_path': 'data/annotated_data/Plate_1_sc.parquet',\n", + " 'dest_path': 'data/converted_data/Plate_1.parquet',\n", + " 'normalized_path': 'data/normalized_data/Plate_1_sc_norm.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate1.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},\n", + " 'Plate_2': {'annotated_path': 'data/annotated_data/Plate_2_sc.parquet',\n", + " 'dest_path': 'data/converted_data/Plate_2.parquet',\n", + " 'normalized_path': 'data/normalized_data/Plate_2_sc_norm.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate2.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},\n", + " 'Plate_3': {'annotated_path': 'data/annotated_data/Plate_3_sc.parquet',\n", + " 'dest_path': 'data/converted_data/Plate_3.parquet',\n", + " 'normalized_path': 'data/normalized_data/Plate_3_sc_norm.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},\n", + " 'Plate_3_prime': {'annotated_path': 'data/annotated_data/Plate_3_prime_sc.parquet',\n", + " 'dest_path': 'data/converted_data/Plate_3_prime.parquet',\n", + " 'normalized_path': 'data/normalized_data/Plate_3_prime_sc_norm.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},\n", + " 'Plate_4': {'annotated_path': 'data/annotated_data/Plate_4_sc.parquet',\n", + " 'dest_path': 'data/converted_data/Plate_4.parquet',\n", + " 'normalized_path': 'data/normalized_data/Plate_4_sc_norm.parquet',\n", + " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate4.csv',\n", + " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite'}}\n" ] } ], "source": [ "# output directory for feature selected data\n", "output_dir = pathlib.Path(\"./data/feature_selected_data\")\n", - "# if directory if doesn't exist, will not raise error if it already exists\n", - "os.makedirs(output_dir, exist_ok=True)\n", + "output_dir.mkdir(exist_ok=True)\n", "\n", "# load in dicionary from yaml file\n", "dictionary_path = pathlib.Path(\"./plate_info_dictionary.yaml\")\n", "with open(dictionary_path) as file:\n", " plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)\n", "\n", - "# view the dictionary to confirm all info is included to use for normalization\n", - "print(json.dumps(plate_info_dictionary, indent=4))" + "# view the dictionary to assess that all info is added correctly\n", + "pprint.pprint(plate_info_dictionary)" ] }, { @@ -133,7 +125,9 @@ "Performing feature selection on normalized annotated merged single cells for Plate_3!\n", "Features have been selected for Plate_3 and saved!\n", "Performing feature selection on normalized annotated merged single cells for Plate_3_prime!\n", - "Features have been selected for Plate_3_prime and saved!\n" + "Features have been selected for Plate_3_prime and saved!\n", + "Performing feature selection on normalized annotated merged single cells for Plate_4!\n", + "Features have been selected for Plate_4 and saved!\n" ] } ], @@ -148,7 +142,7 @@ "# process each run\n", "for plate, info in plate_info_dictionary.items():\n", " normalized_df = pd.read_parquet(info[\"normalized_path\"])\n", - " # output_file does not need to be saved to dictionary as there are no more processin steps after this\n", + " # output_file does not need to be saved to dictionary as there are no more processing steps after this\n", " output_file = str(pathlib.Path(f\"{output_dir}/{plate}_sc_norm_fs.parquet\"))\n", " print(f\"Performing feature selection on normalized annotated merged single cells for {plate}!\")\n", "\n", @@ -177,7 +171,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(14495, 595)\n" + "(7502, 635)\n" ] }, { @@ -202,23 +196,23 @@ " \n", " \n", " Metadata_WellRow\n", + " Metadata_WellCol\n", " Metadata_Well\n", + " Metadata_Site\n", " Metadata_number_of_singlecells\n", - " Metadata_WellCol\n", " Metadata_gene_name\n", " Metadata_genotype\n", " Metadata_seed_density\n", - " Metadata_ImageNumber\n", - " Metadata_Cells_Number_Object_Number\n", - " Metadata_Cytoplasm_Parent_Cells\n", + " Metadata_siRNA\n", + " Metadata_RNAiMax\n", " ...\n", - " Nuclei_Texture_InverseDifferenceMoment_GFP_3_01_256\n", - " Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256\n", " Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256\n", " Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256\n", " Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256\n", - " Nuclei_Texture_SumEntropy_DAPI_3_01_256\n", + " Nuclei_Texture_SumEntropy_CY5_3_00_256\n", + " Nuclei_Texture_SumEntropy_DAPI_3_03_256\n", " Nuclei_Texture_SumEntropy_RFP_3_00_256\n", + " Nuclei_Texture_SumVariance_CY5_3_01_256\n", " Nuclei_Texture_SumVariance_DAPI_3_01_256\n", " Nuclei_Texture_SumVariance_GFP_3_03_256\n", " Nuclei_Texture_SumVariance_RFP_3_01_256\n", @@ -228,228 +222,221 @@ " \n", " 0\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 11\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 78\n", - " 1\n", - " 1\n", + " 1000\n", + " None\n", + " 0\n", " ...\n", - " -0.191334\n", - " 0.671420\n", - " 0.936760\n", - " 1.458638\n", - " 0.948377\n", - " 1.107680\n", - " -1.864463\n", - " 1.259309\n", - " 0.415479\n", - " -0.697022\n", + " -0.853646\n", + " -0.593777\n", + " -0.287816\n", + " -0.307117\n", + " 0.078565\n", + " 2.150259\n", + " -0.548795\n", + " -1.056228\n", + " -0.473540\n", + " 2.716088\n", " \n", " \n", " 1\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 11\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 81\n", - " 2\n", - " 2\n", + " 1000\n", + " None\n", + " 0\n", " ...\n", - " -0.524947\n", - " 0.637803\n", - " 0.186882\n", - " -0.152737\n", - " -0.279862\n", - " 0.115990\n", - " -0.425139\n", - " -0.595902\n", - " -0.245817\n", - " -0.465088\n", + " -0.056489\n", + " 0.159731\n", + " -0.053930\n", + " -0.321569\n", + " 0.779409\n", + " -0.907817\n", + " -0.562408\n", + " 1.374649\n", + " -0.434801\n", + " -0.678851\n", " \n", " \n", " 2\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 11\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 82\n", - " 1\n", - " 1\n", + " 1000\n", + " None\n", + " 0\n", " ...\n", - " -0.230874\n", - " 0.104657\n", - " 0.039143\n", - " 0.272484\n", - " 1.296262\n", - " 0.984872\n", - " -0.957025\n", - " 1.404316\n", - " 0.494810\n", - " -0.577513\n", + " -0.629045\n", + " -0.341210\n", + " -0.587309\n", + " 0.342038\n", + " 0.153377\n", + " 0.525008\n", + " -0.068374\n", + " -1.011699\n", + " -0.132183\n", + " 0.077051\n", " \n", " \n", " 3\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 11\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 82\n", - " 2\n", - " 2\n", + " 1000\n", + " None\n", + " 0\n", " ...\n", - " -0.314377\n", - " 0.665691\n", - " 0.216409\n", - " 0.121547\n", - " 1.337763\n", - " 1.116384\n", - " -0.960571\n", - " 1.663014\n", - " 0.503990\n", - " -0.547628\n", + " -0.201772\n", + " 0.572489\n", + " -0.088943\n", + " -0.149306\n", + " 0.345108\n", + " -0.546746\n", + " -0.400875\n", + " 1.902985\n", + " 0.079753\n", + " -0.402336\n", " \n", " \n", " 4\n", " B\n", - " B1\n", - " 42\n", - " 1\n", + " 2\n", + " B2\n", + " 14\n", + " 115\n", " NF1\n", " WT\n", - " 500\n", - " 83\n", - " 1\n", - " 1\n", + " 1000\n", + " None\n", + " 0\n", " ...\n", - " -0.589706\n", - " 0.528876\n", - " 0.854686\n", - " 1.324683\n", - " 0.926693\n", - " 0.043019\n", - " -1.360293\n", - " -0.635271\n", - " 0.100997\n", - " -0.639121\n", + " -0.379375\n", + " 0.724884\n", + " -0.371386\n", + " -0.954809\n", + " -0.247454\n", + " -0.395320\n", + " -0.713150\n", + " -1.447722\n", + " -0.577792\n", + " -0.543383\n", " \n", " \n", "\n", - "

5 rows × 595 columns

\n", + "

5 rows × 635 columns

\n", "" ], "text/plain": [ - " Metadata_WellRow Metadata_Well Metadata_number_of_singlecells \\\n", - "0 B B1 42 \n", - "1 B B1 42 \n", - "2 B B1 42 \n", - "3 B B1 42 \n", - "4 B B1 42 \n", - "\n", - " Metadata_WellCol Metadata_gene_name Metadata_genotype \\\n", - "0 1 NF1 WT \n", - "1 1 NF1 WT \n", - "2 1 NF1 WT \n", - "3 1 NF1 WT \n", - "4 1 NF1 WT \n", + " Metadata_WellRow Metadata_WellCol Metadata_Well Metadata_Site \n", + "0 B 2 B2 11 \\\n", + "1 B 2 B2 11 \n", + "2 B 2 B2 11 \n", + "3 B 2 B2 11 \n", + "4 B 2 B2 14 \n", "\n", - " Metadata_seed_density Metadata_ImageNumber \\\n", - "0 500 78 \n", - "1 500 81 \n", - "2 500 82 \n", - "3 500 82 \n", - "4 500 83 \n", + " Metadata_number_of_singlecells Metadata_gene_name Metadata_genotype \n", + "0 115 NF1 WT \\\n", + "1 115 NF1 WT \n", + "2 115 NF1 WT \n", + "3 115 NF1 WT \n", + "4 115 NF1 WT \n", "\n", - " Metadata_Cells_Number_Object_Number Metadata_Cytoplasm_Parent_Cells ... \\\n", - "0 1 1 ... \n", - "1 2 2 ... \n", - "2 1 1 ... \n", - "3 2 2 ... \n", - "4 1 1 ... \n", + " Metadata_seed_density Metadata_siRNA Metadata_RNAiMax ... \n", + "0 1000 None 0 ... \\\n", + "1 1000 None 0 ... \n", + "2 1000 None 0 ... \n", + "3 1000 None 0 ... \n", + "4 1000 None 0 ... \n", "\n", - " Nuclei_Texture_InverseDifferenceMoment_GFP_3_01_256 \\\n", - "0 -0.191334 \n", - "1 -0.524947 \n", - "2 -0.230874 \n", - "3 -0.314377 \n", - "4 -0.589706 \n", + " Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256 \n", + "0 -0.853646 \\\n", + "1 -0.056489 \n", + "2 -0.629045 \n", + "3 -0.201772 \n", + "4 -0.379375 \n", "\n", - " Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256 \\\n", - "0 0.671420 \n", - "1 0.637803 \n", - "2 0.104657 \n", - "3 0.665691 \n", - "4 0.528876 \n", + " Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256 \n", + "0 -0.593777 \\\n", + "1 0.159731 \n", + "2 -0.341210 \n", + "3 0.572489 \n", + "4 0.724884 \n", "\n", - " Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256 \\\n", - "0 0.936760 \n", - "1 0.186882 \n", - "2 0.039143 \n", - "3 0.216409 \n", - "4 0.854686 \n", + " Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256 \n", + "0 -0.287816 \\\n", + "1 -0.053930 \n", + "2 -0.587309 \n", + "3 -0.088943 \n", + "4 -0.371386 \n", "\n", - " Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256 \\\n", - "0 1.458638 \n", - "1 -0.152737 \n", - "2 0.272484 \n", - "3 0.121547 \n", - "4 1.324683 \n", + " Nuclei_Texture_SumEntropy_CY5_3_00_256 \n", + "0 -0.307117 \\\n", + "1 -0.321569 \n", + "2 0.342038 \n", + "3 -0.149306 \n", + "4 -0.954809 \n", "\n", - " Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256 \\\n", - "0 0.948377 \n", - "1 -0.279862 \n", - "2 1.296262 \n", - "3 1.337763 \n", - "4 0.926693 \n", + " Nuclei_Texture_SumEntropy_DAPI_3_03_256 \n", + "0 0.078565 \\\n", + "1 0.779409 \n", + "2 0.153377 \n", + "3 0.345108 \n", + "4 -0.247454 \n", "\n", - " Nuclei_Texture_SumEntropy_DAPI_3_01_256 \\\n", - "0 1.107680 \n", - "1 0.115990 \n", - "2 0.984872 \n", - "3 1.116384 \n", - "4 0.043019 \n", + " Nuclei_Texture_SumEntropy_RFP_3_00_256 \n", + "0 2.150259 \\\n", + "1 -0.907817 \n", + "2 0.525008 \n", + "3 -0.546746 \n", + "4 -0.395320 \n", "\n", - " Nuclei_Texture_SumEntropy_RFP_3_00_256 \\\n", - "0 -1.864463 \n", - "1 -0.425139 \n", - "2 -0.957025 \n", - "3 -0.960571 \n", - "4 -1.360293 \n", + " Nuclei_Texture_SumVariance_CY5_3_01_256 \n", + "0 -0.548795 \\\n", + "1 -0.562408 \n", + "2 -0.068374 \n", + "3 -0.400875 \n", + "4 -0.713150 \n", "\n", - " Nuclei_Texture_SumVariance_DAPI_3_01_256 \\\n", - "0 1.259309 \n", - "1 -0.595902 \n", - "2 1.404316 \n", - "3 1.663014 \n", - "4 -0.635271 \n", + " Nuclei_Texture_SumVariance_DAPI_3_01_256 \n", + "0 -1.056228 \\\n", + "1 1.374649 \n", + "2 -1.011699 \n", + "3 1.902985 \n", + "4 -1.447722 \n", "\n", - " Nuclei_Texture_SumVariance_GFP_3_03_256 \\\n", - "0 0.415479 \n", - "1 -0.245817 \n", - "2 0.494810 \n", - "3 0.503990 \n", - "4 0.100997 \n", + " Nuclei_Texture_SumVariance_GFP_3_03_256 \n", + "0 -0.473540 \\\n", + "1 -0.434801 \n", + "2 -0.132183 \n", + "3 0.079753 \n", + "4 -0.577792 \n", "\n", " Nuclei_Texture_SumVariance_RFP_3_01_256 \n", - "0 -0.697022 \n", - "1 -0.465088 \n", - "2 -0.577513 \n", - "3 -0.547628 \n", - "4 -0.639121 \n", + "0 2.716088 \n", + "1 -0.678851 \n", + "2 0.077051 \n", + "3 -0.402336 \n", + "4 -0.543383 \n", "\n", - "[5 rows x 595 columns]" + "[5 rows x 635 columns]" ] }, "execution_count": 4, diff --git a/3.processing_features/scripts/0.merge_sc_cytotable.py b/3.processing_features/scripts/0.merge_sc_cytotable.py index 7a4f01d..b3d3be0 100644 --- a/3.processing_features/scripts/0.merge_sc_cytotable.py +++ b/3.processing_features/scripts/0.merge_sc_cytotable.py @@ -11,6 +11,8 @@ import sys import pathlib import yaml +import pprint +import pandas as pd # cytotable will merge objects from SQLite file into single cells and save as parquet file from cytotable import convert, presets @@ -27,6 +29,20 @@ # type of file output from CytoTable (currently only parquet) dest_datatype = "parquet" +# set main output dir for all parquet files +output_dir = pathlib.Path("./data/converted_data/") +output_dir.mkdir(exist_ok=True) + +# directory where SQLite files are located +sqlite_dir = pathlib.Path("../2.cellprofiler_analysis/analysis_output/") + +# list for plate names based on folders to use to create dictionary +plate_names = [] +# iterate through 0.download_data and append plate names from folder names that contain image data from that plate +for file_path in pathlib.Path("../0.download_data/").iterdir(): + if str(file_path.stem).startswith("Plate"): + plate_names.append(str(file_path.stem)) + # preset configurations based on typical CellProfiler outputs preset = "cellprofiler_sqlite_pycytominer" # remove Image_Metadata_Plate from SELECT as this metadata was not extracted from file names @@ -59,52 +75,27 @@ """ +# ## Create dictionary with info for each plate +# +# **Note:** All paths must be string to use with CytoTable. + # In[3]: -# dictionary with info for the sqlite file from each plate +# create plate info dictionary with all parts of the CellProfiler CLI command to run in parallel plate_info_dictionary = { - "Plate_1": { - # path to outputted SQLite file - "source_path": str( - pathlib.Path( - "../2.cellprofiler_analysis/analysis_output/Plate_1.sqlite" - ) - ), - "dest_path": str(pathlib.Path(f"./data/converted_data/Plate_1.parquet")), - }, - "Plate_2": { - # path to outputted SQLite file - "source_path": str( - pathlib.Path( - "../2.cellprofiler_analysis/analysis_output/Plate_2.sqlite" - ) - ), - # path for merged single cell paraquet file (without annotations) - "dest_path": str(pathlib.Path(f"./data/converted_data/Plate_2.parquet")), - }, - "Plate_3": { - # path to outputted SQLite file - "source_path": str( - pathlib.Path( - "../2.cellprofiler_analysis/analysis_output/Plate_3.sqlite" - ) - ), - # path for merged single cell paraquet file (without annotations) - "dest_path": str(pathlib.Path(f"./data/converted_data/Plate_3.parquet")), - }, - "Plate_3_prime": { - # path to outputted SQLite file - "source_path": str( - pathlib.Path( - "../2.cellprofiler_analysis/analysis_output/Plate_3_prime.sqlite" - ) - ), - # path for merged single cell paraquet file (without annotations) - "dest_path": str(pathlib.Path(f"./data/converted_data/Plate_3_prime.parquet")), + name: { + "source_path": str(pathlib.Path( + list(sqlite_dir.rglob(f"{name}_nf1_analysis.sqlite"))[0] + ).resolve(strict=True)), + "dest_path": str(pathlib.Path(f"{output_dir}/{name}.parquet")), } + for name in plate_names } +# view the dictionary to assess that all info is added correctly +pprint.pprint(plate_info_dictionary, indent=4) + # ## Merge objects to single cells and convert SQLite to parquet file + add single cell metadata @@ -133,9 +124,19 @@ print(f"Added single cell count as metadata to {pathlib.Path(dest_path).name}!") +# In[5]: + + +converted_df = pd.read_parquet(plate_info_dictionary["Plate_4"]["dest_path"]) + +# load in and print a converted df to see if it looks correct +print(converted_df.shape) +converted_df.head() + + # ## Write dictionary to yaml file for use in downstream steps -# In[5]: +# In[6]: dictionary_path = pathlib.Path("./plate_info_dictionary.yaml") diff --git a/3.processing_features/scripts/1.annotate_sc.py b/3.processing_features/scripts/1.annotate_sc.py index d50f644..aae8d5b 100644 --- a/3.processing_features/scripts/1.annotate_sc.py +++ b/3.processing_features/scripts/1.annotate_sc.py @@ -10,9 +10,8 @@ import sys import pathlib -import os import yaml -import json +import pprint import pandas as pd from pycytominer import annotate @@ -29,28 +28,45 @@ # output directory for annotated data output_dir = pathlib.Path("./data/annotated_data") -# if directory if doesn't exist, will not raise error if it already exists -os.makedirs(output_dir, exist_ok=True) +output_dir.mkdir(exist_ok=True) + +# directory with metadata +metadata_dir = pathlib.Path("../0.download_data/metadata/") # load in dicionary from yaml file dictionary_path = pathlib.Path("./plate_info_dictionary.yaml") with open(dictionary_path) as file: plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader) -# add paths to dictionary that are used for annotation -plate_info_dictionary["Plate_1"]["platemap_path"] = str(pathlib.Path("../0.download_data/metadata/platemap_NF1_plate1.csv")) -plate_info_dictionary["Plate_2"]["platemap_path"] = str(pathlib.Path("../0.download_data/metadata/platemap_NF1_plate2.csv")) -# both plates 3 and 3 prime use the same platemap file (same metadata) -plate_info_dictionary["Plate_3"]["platemap_path"] = str(pathlib.Path("../0.download_data/metadata/platemap_NF1_plate3.csv")) -plate_info_dictionary["Plate_3_prime"]["platemap_path"] = str(pathlib.Path("../0.download_data/metadata/platemap_NF1_plate3.csv")) + +# ## Add metadata paths to loaded in dictionary + +# In[3]: + + +# add path to platemaps for each plate +for plate, _ in plate_info_dictionary.items(): + # since Plate_3_prime has the same platemap as Plate_3, we need an else statement so that we make sure it adds the + # path that was given to Plate_3 + if plate != "Plate_3_prime": + # match the naming format of the plates to the platemap file + plate_info_dictionary[plate]["platemap_path"] = str( + pathlib.Path(list(metadata_dir.rglob(f"platemap_NF1_{plate.replace('_', '').lower()}.csv"))[0]).resolve( + strict=True + ) + ) + else: + plate_info_dictionary["Plate_3_prime"]["platemap_path"] = plate_info_dictionary["Plate_3"]["platemap_path"] # view the dictionary to assess that all info is added correctly -print(json.dumps(plate_info_dictionary, indent=4)) +pprint.pprint(plate_info_dictionary, indent=4) # ## Annotate merged single cells +# +# **Note:** The path to the annotated file to be used for normalization is adding during this step. -# In[3]: +# In[4]: for plate, info in plate_info_dictionary.items(): @@ -69,12 +85,18 @@ join_on=["Metadata_well_position", "Image_Metadata_Well"], ) - # move metadata well and single cell count to the front of the df (for easy visualization in python) + # rename site column to avoid any issues with identifying the column as metadata over feature + annotated_df = annotated_df.rename(columns={"Image_Metadata_Site": "Metadata_Site"}) + + # move metadata well, single cell count, and site to the front of the df (for easy visualization in python) well_column = annotated_df.pop("Metadata_Well") singlecell_column = annotated_df.pop("Metadata_number_of_singlecells") - # insert the column as the second index column in the dataframe - annotated_df.insert(1, "Metadata_Well", well_column) - annotated_df.insert(2, "Metadata_number_of_singlecells", singlecell_column) + site_column = annotated_df.pop("Metadata_Site") + + # insert the columns in specific parts of the dataframe + annotated_df.insert(2, "Metadata_Well", well_column) + annotated_df.insert(3, "Metadata_Site", site_column) + annotated_df.insert(4, "Metadata_number_of_singlecells", singlecell_column) # save annotated df as parquet file output( @@ -85,7 +107,7 @@ print(f"Annotations have been added to {plate} and saved!") -# In[4]: +# In[5]: # print last annotated df to see if annotation occurred @@ -95,9 +117,9 @@ # ## Write updated dictionary to yaml file for use in downstream steps -# In[5]: +# In[6]: -with open(dictionary_path, 'w') as file: +with open(dictionary_path, "w") as file: yaml.dump(plate_info_dictionary, file) diff --git a/3.processing_features/scripts/2.normalize_sc.py b/3.processing_features/scripts/2.normalize_sc.py index deeb1c0..2e84ad6 100644 --- a/3.processing_features/scripts/2.normalize_sc.py +++ b/3.processing_features/scripts/2.normalize_sc.py @@ -8,33 +8,36 @@ # In[1]: -import sys import pathlib -import os import yaml -import json +import pprint import pandas as pd from pycytominer import normalize from pycytominer.cyto_utils import output +# ## Set paths and load in dictionary from annotated run + # In[2]: # output directory for normalized data output_dir = pathlib.Path("./data/normalized_data") -# if directory if doesn't exist, will not raise error if it already exists -os.makedirs(output_dir, exist_ok=True) +output_dir.mkdir(exist_ok=True) # load in dicionary from yaml file dictionary_path = pathlib.Path("./plate_info_dictionary.yaml") with open(dictionary_path) as file: plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader) -# view the dictionary to confirm all info is included to use for normalization -print(json.dumps(plate_info_dictionary, indent=4)) +# view the dictionary to assess that all info is added correctly +pprint.pprint(plate_info_dictionary) + +# ## Normalize annotated single cells from each plate +# +# **Note:** Path to normalized data for each plate is added to the dictionary in this step to be used during feature selection. # In[3]: @@ -42,8 +45,9 @@ # process each run for plate, info in plate_info_dictionary.items(): annotated_df = pd.read_parquet(info["annotated_path"]) + # set output path and add to the dictionary output_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_norm.parquet")) - # save path to annotated file to dictionary for downstream use + # save path to normalized file to dictionary for downstream use plate_info_dictionary[plate]["normalized_path"] = output_file print(f"Normalizing annotated merged single cells for {plate}!") diff --git a/3.processing_features/scripts/3.feature_select_sc.py b/3.processing_features/scripts/3.feature_select_sc.py index 6cbb179..beb12f4 100644 --- a/3.processing_features/scripts/3.feature_select_sc.py +++ b/3.processing_features/scripts/3.feature_select_sc.py @@ -8,11 +8,9 @@ # In[1]: -import sys import pathlib -import os import yaml -import json +import pprint import pandas as pd from pycytominer import feature_select @@ -26,16 +24,15 @@ # output directory for feature selected data output_dir = pathlib.Path("./data/feature_selected_data") -# if directory if doesn't exist, will not raise error if it already exists -os.makedirs(output_dir, exist_ok=True) +output_dir.mkdir(exist_ok=True) # load in dicionary from yaml file dictionary_path = pathlib.Path("./plate_info_dictionary.yaml") with open(dictionary_path) as file: plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader) -# view the dictionary to confirm all info is included to use for normalization -print(json.dumps(plate_info_dictionary, indent=4)) +# view the dictionary to assess that all info is added correctly +pprint.pprint(plate_info_dictionary) # ## Perform feature selection @@ -64,7 +61,7 @@ # process each run for plate, info in plate_info_dictionary.items(): normalized_df = pd.read_parquet(info["normalized_path"]) - # output_file does not need to be saved to dictionary as there are no more processin steps after this + # output_file does not need to be saved to dictionary as there are no more processing steps after this output_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_norm_fs.parquet")) print(f"Performing feature selection on normalized annotated merged single cells for {plate}!") From 102d405525b3f7e8881b1638fde2bb75181412d3 Mon Sep 17 00:00:00 2001 From: jenna-tomkinson Date: Mon, 17 Jul 2023 13:15:26 -0600 Subject: [PATCH 3/5] add edited dict file --- .../plate_info_dictionary.yaml | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/3.processing_features/plate_info_dictionary.yaml b/3.processing_features/plate_info_dictionary.yaml index 7e588da..293e8c8 100644 --- a/3.processing_features/plate_info_dictionary.yaml +++ b/3.processing_features/plate_info_dictionary.yaml @@ -2,23 +2,29 @@ Plate_1: annotated_path: data/annotated_data/Plate_1_sc.parquet dest_path: data/converted_data/Plate_1.parquet normalized_path: data/normalized_data/Plate_1_sc_norm.parquet - platemap_path: ../0.download_data/metadata/platemap_NF1_plate1.csv - source_path: ../2.cellprofiler_analysis/analysis_output/Plate_1.sqlite + platemap_path: /home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate1.csv + source_path: /home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite Plate_2: annotated_path: data/annotated_data/Plate_2_sc.parquet dest_path: data/converted_data/Plate_2.parquet normalized_path: data/normalized_data/Plate_2_sc_norm.parquet - platemap_path: ../0.download_data/metadata/platemap_NF1_plate2.csv - source_path: ../2.cellprofiler_analysis/analysis_output/Plate_2.sqlite + platemap_path: /home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate2.csv + source_path: /home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite Plate_3: annotated_path: data/annotated_data/Plate_3_sc.parquet dest_path: data/converted_data/Plate_3.parquet normalized_path: data/normalized_data/Plate_3_sc_norm.parquet - platemap_path: ../0.download_data/metadata/platemap_NF1_plate3.csv - source_path: ../2.cellprofiler_analysis/analysis_output/Plate_3.sqlite + platemap_path: /home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv + source_path: /home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite Plate_3_prime: annotated_path: data/annotated_data/Plate_3_prime_sc.parquet dest_path: data/converted_data/Plate_3_prime.parquet normalized_path: data/normalized_data/Plate_3_prime_sc_norm.parquet - platemap_path: ../0.download_data/metadata/platemap_NF1_plate3.csv - source_path: ../2.cellprofiler_analysis/analysis_output/Plate_3_prime.sqlite + platemap_path: /home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv + source_path: /home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite +Plate_4: + annotated_path: data/annotated_data/Plate_4_sc.parquet + dest_path: data/converted_data/Plate_4.parquet + normalized_path: data/normalized_data/Plate_4_sc_norm.parquet + platemap_path: /home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate4.csv + source_path: /home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite From 14a5373f6142de6bfadb1b42ffe8aaf32d13d2d3 Mon Sep 17 00:00:00 2001 From: jenna-tomkinson Date: Mon, 17 Jul 2023 13:15:49 -0600 Subject: [PATCH 4/5] add plate 4 data + fixed site issue in other plates --- 3.processing_features/data/annotated_data/Plate_1_sc.parquet | 4 ++-- 3.processing_features/data/annotated_data/Plate_2_sc.parquet | 4 ++-- .../data/annotated_data/Plate_3_prime_sc.parquet | 4 ++-- 3.processing_features/data/annotated_data/Plate_3_sc.parquet | 4 ++-- 3.processing_features/data/annotated_data/Plate_4_sc.parquet | 3 +++ 3.processing_features/data/converted_data/Plate_4.parquet | 3 +++ .../data/feature_selected_data/Plate_1_sc_norm_fs.parquet | 4 ++-- .../data/feature_selected_data/Plate_2_sc_norm_fs.parquet | 4 ++-- .../feature_selected_data/Plate_3_prime_sc_norm_fs.parquet | 4 ++-- .../data/feature_selected_data/Plate_3_sc_norm_fs.parquet | 4 ++-- .../data/feature_selected_data/Plate_4_sc_norm_fs.parquet | 3 +++ .../data/normalized_data/Plate_1_sc_norm.parquet | 4 ++-- .../data/normalized_data/Plate_2_sc_norm.parquet | 4 ++-- .../data/normalized_data/Plate_3_prime_sc_norm.parquet | 4 ++-- .../data/normalized_data/Plate_3_sc_norm.parquet | 4 ++-- .../data/normalized_data/Plate_4_sc_norm.parquet | 3 +++ 16 files changed, 36 insertions(+), 24 deletions(-) create mode 100644 3.processing_features/data/annotated_data/Plate_4_sc.parquet create mode 100644 3.processing_features/data/converted_data/Plate_4.parquet create mode 100644 3.processing_features/data/feature_selected_data/Plate_4_sc_norm_fs.parquet create mode 100644 3.processing_features/data/normalized_data/Plate_4_sc_norm.parquet diff --git a/3.processing_features/data/annotated_data/Plate_1_sc.parquet b/3.processing_features/data/annotated_data/Plate_1_sc.parquet index 82eea4a..01bb5dc 100644 --- a/3.processing_features/data/annotated_data/Plate_1_sc.parquet +++ b/3.processing_features/data/annotated_data/Plate_1_sc.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4343be5fb854808b475581be2fc5b51aa01880869ef0bca70bd185f471203658 -size 3482348 +oid sha256:da6316ff816154fc5c219327ceca23f3d74a728767781e2ebd1ec8dc342953ef +size 3482298 diff --git a/3.processing_features/data/annotated_data/Plate_2_sc.parquet b/3.processing_features/data/annotated_data/Plate_2_sc.parquet index de4752b..0a973ab 100644 --- a/3.processing_features/data/annotated_data/Plate_2_sc.parquet +++ b/3.processing_features/data/annotated_data/Plate_2_sc.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9887bbc6f7f5b013e4af1932b6d35575d31e9cc1f57877f7d44c7126c8baf245 -size 18283127 +oid sha256:af067a202f21aaa7135e0b7b0920582d253cf8d78e96408a9a9afb1169378254 +size 18283075 diff --git a/3.processing_features/data/annotated_data/Plate_3_prime_sc.parquet b/3.processing_features/data/annotated_data/Plate_3_prime_sc.parquet index 0796026..77e2bb6 100644 --- a/3.processing_features/data/annotated_data/Plate_3_prime_sc.parquet +++ b/3.processing_features/data/annotated_data/Plate_3_prime_sc.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c2c16aa8947d3f51613cbe168f07d14ee841e2914973d00bde612e1582ab77f -size 200257737 +oid sha256:d3dc35dbb6a6e8fe73cf4d74b411043641129462bbb2569e9fa5a4210b4b994d +size 200257701 diff --git a/3.processing_features/data/annotated_data/Plate_3_sc.parquet b/3.processing_features/data/annotated_data/Plate_3_sc.parquet index 189e403..ecc0484 100644 --- a/3.processing_features/data/annotated_data/Plate_3_sc.parquet +++ b/3.processing_features/data/annotated_data/Plate_3_sc.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3423f8a3f437b5b607d5af9a667d4111f00971fcebaee9ddcf4be63d9d36281e -size 257764059 +oid sha256:9d668b2865d25a24376915079a6d1c2378124ec37f52035a337310137935c888 +size 257764027 diff --git a/3.processing_features/data/annotated_data/Plate_4_sc.parquet b/3.processing_features/data/annotated_data/Plate_4_sc.parquet new file mode 100644 index 0000000..7f6b669 --- /dev/null +++ b/3.processing_features/data/annotated_data/Plate_4_sc.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa5d6df41d4bf761763a31573cfc78977dcd3b76f52f449f9b8d807f3916b279 +size 103118339 diff --git a/3.processing_features/data/converted_data/Plate_4.parquet b/3.processing_features/data/converted_data/Plate_4.parquet new file mode 100644 index 0000000..bd61d84 --- /dev/null +++ b/3.processing_features/data/converted_data/Plate_4.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ddacc0323239f65b4978d5739d7488dd18dd8dbb0f7679e77155ec48fbe8285 +size 103106330 diff --git a/3.processing_features/data/feature_selected_data/Plate_1_sc_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_1_sc_norm_fs.parquet index ed792bd..b170e34 100644 --- a/3.processing_features/data/feature_selected_data/Plate_1_sc_norm_fs.parquet +++ b/3.processing_features/data/feature_selected_data/Plate_1_sc_norm_fs.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d409096648d1abb272f8206d762ca8e2d9ec3234454ef175d19128d9fe847989 -size 1416640 +oid sha256:f0185fa6b4a5b6b08d60dc2bea4a46be220d0288b2428b5dbf564e7752e3ef14 +size 1417303 diff --git a/3.processing_features/data/feature_selected_data/Plate_2_sc_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_2_sc_norm_fs.parquet index a9b39eb..794db40 100644 --- a/3.processing_features/data/feature_selected_data/Plate_2_sc_norm_fs.parquet +++ b/3.processing_features/data/feature_selected_data/Plate_2_sc_norm_fs.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d72cb358279f2231081b432061915ac9598b4293bb4c085a7188a0afbde338c1 -size 7443114 +oid sha256:ef4a63b8a75e84b48d7a8a03f4a67d9b5e2018a973acae0b6fdb1779a8b06650 +size 7444104 diff --git a/3.processing_features/data/feature_selected_data/Plate_3_prime_sc_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_3_prime_sc_norm_fs.parquet index 92ca923..b141ce8 100644 --- a/3.processing_features/data/feature_selected_data/Plate_3_prime_sc_norm_fs.parquet +++ b/3.processing_features/data/feature_selected_data/Plate_3_prime_sc_norm_fs.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc95019a3553a48e8ce2c2f3c80ed87f4a1c9c75e2a539731911306efcb56e77 -size 80686014 +oid sha256:6bd7cd784e232e9b7cc0a84b921919bc8b584c4dcb8ff19ae0ff9c6def3820fc +size 80693358 diff --git a/3.processing_features/data/feature_selected_data/Plate_3_sc_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_3_sc_norm_fs.parquet index 8137344..08d2fc3 100644 --- a/3.processing_features/data/feature_selected_data/Plate_3_sc_norm_fs.parquet +++ b/3.processing_features/data/feature_selected_data/Plate_3_sc_norm_fs.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:549cdde84464d296866312b4da898541e17f94eb441a975b79896d43cd010483 -size 107322510 +oid sha256:fd509b89278e45c23edb58b1a1ca6588d6f04e01094d11fc130f5ef1cf2b8db6 +size 107330217 diff --git a/3.processing_features/data/feature_selected_data/Plate_4_sc_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_4_sc_norm_fs.parquet new file mode 100644 index 0000000..01230aa --- /dev/null +++ b/3.processing_features/data/feature_selected_data/Plate_4_sc_norm_fs.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0f1b563b31cc8e2116acdc52b30da36849d6d59ad7e97b35e81b3ca7c2f29b5 +size 44386548 diff --git a/3.processing_features/data/normalized_data/Plate_1_sc_norm.parquet b/3.processing_features/data/normalized_data/Plate_1_sc_norm.parquet index 7737793..7c6e5c2 100644 --- a/3.processing_features/data/normalized_data/Plate_1_sc_norm.parquet +++ b/3.processing_features/data/normalized_data/Plate_1_sc_norm.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05987755f074a23e93a35d04b2096fbea0c55609ee3c47c365bd20b37e26042b -size 3544984 +oid sha256:38b6ef063a7c5caf4e4454e81980ce32884965d3fbe7553a3aa7e37054c5b8e0 +size 3545645 diff --git a/3.processing_features/data/normalized_data/Plate_2_sc_norm.parquet b/3.processing_features/data/normalized_data/Plate_2_sc_norm.parquet index 286a917..8c41bbb 100644 --- a/3.processing_features/data/normalized_data/Plate_2_sc_norm.parquet +++ b/3.processing_features/data/normalized_data/Plate_2_sc_norm.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75a167f58dd809ec35eb8ff28e548b5a3752f2a6add031f04f5cb2d386c5a8fa -size 18644538 +oid sha256:bd8587d6168f9a922b23a0f6a166f89929775825ad77de945cd0a6d614a5da67 +size 18645528 diff --git a/3.processing_features/data/normalized_data/Plate_3_prime_sc_norm.parquet b/3.processing_features/data/normalized_data/Plate_3_prime_sc_norm.parquet index ae03f26..f8b21c7 100644 --- a/3.processing_features/data/normalized_data/Plate_3_prime_sc_norm.parquet +++ b/3.processing_features/data/normalized_data/Plate_3_prime_sc_norm.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9217a0155f56ca7f5951d635d9136575653373fa30afd7d667b7b1986c7b80b5 -size 202677906 +oid sha256:bc21d315ef7ad50c587720412114fea98605ff474babb2ddb446a9740c5783fb +size 202685258 diff --git a/3.processing_features/data/normalized_data/Plate_3_sc_norm.parquet b/3.processing_features/data/normalized_data/Plate_3_sc_norm.parquet index 8fd0145..a9377b2 100644 --- a/3.processing_features/data/normalized_data/Plate_3_sc_norm.parquet +++ b/3.processing_features/data/normalized_data/Plate_3_sc_norm.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58a48e626ff5518a9a7960c4c3aa5c09458d6ef7bbdd45d0badcf54821008075 -size 260676428 +oid sha256:9c839005fe53ec88db205a1d2afd0644389a9e21c57b4f595a5757a2dbfef6c7 +size 260684143 diff --git a/3.processing_features/data/normalized_data/Plate_4_sc_norm.parquet b/3.processing_features/data/normalized_data/Plate_4_sc_norm.parquet new file mode 100644 index 0000000..304a3ba --- /dev/null +++ b/3.processing_features/data/normalized_data/Plate_4_sc_norm.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cbe3342389967fa1180587e42c2090eb9964d4a3650c1a9e49358e360a4ea1e +size 104593543 From fb58cae8a639ecc1c8bccb40c39bdad44ea7e396 Mon Sep 17 00:00:00 2001 From: jenna-tomkinson Date: Wed, 19 Jul 2023 08:25:25 -0600 Subject: [PATCH 5/5] change items to keys for creating dictionary --- 3.processing_features/1.annotate_sc.ipynb | 2 +- 3.processing_features/scripts/1.annotate_sc.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/3.processing_features/1.annotate_sc.ipynb b/3.processing_features/1.annotate_sc.ipynb index ce2f2c2..1d535a5 100644 --- a/3.processing_features/1.annotate_sc.ipynb +++ b/3.processing_features/1.annotate_sc.ipynb @@ -98,7 +98,7 @@ ], "source": [ "# add path to platemaps for each plate \n", - "for plate, _ in plate_info_dictionary.items():\n", + "for plate in plate_info_dictionary.keys():\n", " # since Plate_3_prime has the same platemap as Plate_3, we need an else statement so that we make sure it adds the \n", " # path that was given to Plate_3\n", " if plate != \"Plate_3_prime\":\n", diff --git a/3.processing_features/scripts/1.annotate_sc.py b/3.processing_features/scripts/1.annotate_sc.py index aae8d5b..a006e4e 100644 --- a/3.processing_features/scripts/1.annotate_sc.py +++ b/3.processing_features/scripts/1.annotate_sc.py @@ -45,7 +45,7 @@ # add path to platemaps for each plate -for plate, _ in plate_info_dictionary.items(): +for plate in plate_info_dictionary.keys(): # since Plate_3_prime has the same platemap as Plate_3, we need an else statement so that we make sure it adds the # path that was given to Plate_3 if plate != "Plate_3_prime":