From dfe495d8cf6179115d3d9cc7f28c82a70ee8a2c2 Mon Sep 17 00:00:00 2001 From: gwaybio Date: Fri, 4 Aug 2023 13:00:48 -0600 Subject: [PATCH 01/10] ignore juypter checkpoints --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index de22cfd..6676234 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ Corrected_Images # ignore Mac related files .DS_Store + +# ignore jupyter files +.ipynb_checkpoints/ From a88b79c099d252f5088f392ebd27ed5f62008561 Mon Sep 17 00:00:00 2001 From: gwaybio Date: Fri, 4 Aug 2023 13:02:11 -0600 Subject: [PATCH 02/10] update pycytominer hash, add jupyterlab, remove broken CP --- nf1_cellpainting_env.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nf1_cellpainting_env.yml b/nf1_cellpainting_env.yml index 0a6ece9..28b7eb6 100644 --- a/nf1_cellpainting_env.yml +++ b/nf1_cellpainting_env.yml @@ -7,6 +7,7 @@ dependencies: # this is restricted to Python 3.8 at this time due to other conflicts - conda-forge::python=3.8 - conda-forge::ipykernel +- conda-forge::jupyterlab - conda-forge::nbconvert=6.4.4 - conda-forge::pip - conda-forge::numpy @@ -27,6 +28,5 @@ dependencies: - conda-forge::wxpython=4.1.0 - conda-forge::sentry-sdk=0.18.0 - pip: - - cellprofiler==4.2.4 - git+https://github.com/cytomining/CytoTable@e3773183b8fd930a6513c65dd3e45418e9a5e80d - - git+https://github.com/cytomining/pycytominer@7e571322d211f66e716bf135c8b52898556fb960 + - git+https://github.com/cytomining/pycytominer@c4de0a9a4fecbf1ad11872bb14c18d24d1b1851e From 8b2c55d1f34ac0d5172054bfcdee0712253ce5b6 Mon Sep 17 00:00:00 2001 From: gwaybio Date: Fri, 4 Aug 2023 13:04:24 -0600 Subject: [PATCH 03/10] execute ipynb directly and chmod +x --- 3.processing_features/processing_features.sh | 25 +++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) mode change 100644 => 100755 3.processing_features/processing_features.sh diff --git a/3.processing_features/processing_features.sh b/3.processing_features/processing_features.sh old mode 100644 new mode 100755 index d60e318..9b3008e --- a/3.processing_features/processing_features.sh +++ b/3.processing_features/processing_features.sh @@ -2,15 +2,28 @@ # initialize the correct shell for your machine to allow conda to work (see README for note on shell names) conda init bash + # activate the main conda environment conda activate nf1_cellpainting_data # convert all notebooks to python files into the scripts folder jupyter nbconvert --to python --output-dir=scripts/ *.ipynb -# run the python scripts in order (from convert+merge, aggregate, annotate, normalize, and feature select) -python scripts/0.merge_sc_cytotable.py -python scripts/1.aggregate_sc.py -python scripts/2.annotate.py -python scripts/3.normalize.py -python scripts/4.feature_select.py +# run jupyter notebooks +jupyter nbconvert --to=html \ + --FilesWriter.build_directory=scripts/html \ + --ExecutePreprocessor.kernel_name=python3 \ + --ExecutePreprocessor.timeout=10000000 \ + --execute 0.merge_sc_cytotable + +jupyter nbconvert --to=html \ + --FilesWriter.build_directory=scripts/html \ + --ExecutePreprocessor.kernel_name=python3 \ + --ExecutePreprocessor.timeout=10000000 \ + --execute 1.pycytominer_bulk_pipelines.ipynb + +jupyter nbconvert --to=html \ + --FilesWriter.build_directory=scripts/html \ + --ExecutePreprocessor.kernel_name=python3 \ + --ExecutePreprocessor.timeout=10000000 \ + --execute 2.pycytominer_singlecell_pipelines.ipynb From 187fe220d26822746eeee5bb8613ea5b0f0fc01e Mon Sep 17 00:00:00 2001 From: gwaybio Date: Fri, 4 Aug 2023 13:05:27 -0600 Subject: [PATCH 04/10] clarify print --- .../0.merge_sc_cytotable.ipynb | 121 +++++++++++------- .../scripts/0.merge_sc_cytotable.py | 24 +++- 2 files changed, 91 insertions(+), 54 deletions(-) diff --git a/3.processing_features/0.merge_sc_cytotable.ipynb b/3.processing_features/0.merge_sc_cytotable.ipynb index 7a55319..887a5ed 100644 --- a/3.processing_features/0.merge_sc_cytotable.ipynb +++ b/3.processing_features/0.merge_sc_cytotable.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -9,7 +8,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -36,7 +34,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -47,7 +44,18 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['Plate_3_prime', 'Plate_1', 'Plate_4', 'Plate_3', 'Plate_2']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# type of file output from CytoTable (currently only parquet)\n", "dest_datatype = \"parquet\"\n", @@ -61,11 +69,23 @@ "\n", "# list for plate names based on folders to use to create dictionary\n", "plate_names = []\n", - "# iterate through 0.download_data and append plate names from folder names that contain image data from that plate\n", + "\n", + "# iterate through 0.download_data and append plate names from folder names\n", + "# that contain image data from that plate\n", + "# (Note, you must first run `0.download_data/download_plates.ipynb`)\n", "for file_path in pathlib.Path(\"../0.download_data/\").iterdir():\n", " if str(file_path.stem).startswith(\"Plate\"):\n", " plate_names.append(str(file_path.stem))\n", - "\n", + " \n", + "plate_names" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ "# preset configurations based on typical CellProfiler outputs\n", "preset = \"cellprofiler_sqlite_pycytominer\"\n", "# remove Image_Metadata_Plate from SELECT as this metadata was not extracted from file names\n", @@ -110,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -118,15 +138,15 @@ "output_type": "stream", "text": [ "{ 'Plate_1': { 'dest_path': 'data/converted_data/Plate_1.parquet',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},\n", " 'Plate_2': { 'dest_path': 'data/converted_data/Plate_2.parquet',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},\n", " 'Plate_3': { 'dest_path': 'data/converted_data/Plate_3.parquet',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},\n", " 'Plate_3_prime': { 'dest_path': 'data/converted_data/Plate_3_prime.parquet',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},\n", " 'Plate_4': { 'dest_path': 'data/converted_data/Plate_4.parquet',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite'}}\n" + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite'}}\n" ] } ], @@ -147,7 +167,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -156,28 +175,28 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Performing merge single cells and conversion on Plate_4!\n", - "Merged and converted Plate_4.parquet!\n", - "Added single cell count as metadata to Plate_4.parquet!\n", + "Performing merge single cells and conversion on Plate_3_prime!\n", + "Merged and converted Plate_3_prime.parquet!\n", + "Added single cell count as metadata to Plate_3_prime.parquet!\n", "Performing merge single cells and conversion on Plate_1!\n", "Merged and converted Plate_1.parquet!\n", "Added single cell count as metadata to Plate_1.parquet!\n", + "Performing merge single cells and conversion on Plate_4!\n", + "Merged and converted Plate_4.parquet!\n", + "Added single cell count as metadata to Plate_4.parquet!\n", "Performing merge single cells and conversion on Plate_3!\n", "Merged and converted Plate_3.parquet!\n", "Added single cell count as metadata to Plate_3.parquet!\n", "Performing merge single cells and conversion on Plate_2!\n", "Merged and converted Plate_2.parquet!\n", - "Added single cell count as metadata to Plate_2.parquet!\n", - "Performing merge single cells and conversion on Plate_3_prime!\n", - "Merged and converted Plate_3_prime.parquet!\n", - "Added single cell count as metadata to Plate_3_prime.parquet!\n" + "Added single cell count as metadata to Plate_2.parquet!\n" ] } ], @@ -186,6 +205,7 @@ "for plate, info in plate_info_dictionary.items():\n", " source_path = info[\"source_path\"]\n", " dest_path = info[\"dest_path\"]\n", + " \n", " print(f\"Performing merge single cells and conversion on {plate}!\")\n", "\n", " # merge single cells and output as parquet file\n", @@ -201,12 +221,20 @@ " sc_utils.add_sc_count_metadata_file(\n", " data_path=dest_path, well_column_name=\"Image_Metadata_Well\", file_type=\"parquet\"\n", " )\n", + " \n", " print(f\"Added single cell count as metadata to {pathlib.Path(dest_path).name}!\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check if converted data looks correct" + ] + }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -387,64 +415,64 @@ "" ], "text/plain": [ - " Metadata_ImageNumber Image_Metadata_Plate Metadata_number_of_singlecells \n", - "0 1 Plate_4 81 \\\n", + " Metadata_ImageNumber Image_Metadata_Plate Metadata_number_of_singlecells \\\n", + "0 1 Plate_4 81 \n", "1 1 Plate_4 81 \n", "2 2 Plate_4 81 \n", "3 2 Plate_4 81 \n", "4 2 Plate_4 81 \n", "\n", - " Image_Metadata_Site Image_Metadata_Well \n", - "0 10 B10 \\\n", + " Image_Metadata_Site Image_Metadata_Well \\\n", + "0 10 B10 \n", "1 10 B10 \n", "2 11 B10 \n", "3 11 B10 \n", "4 11 B10 \n", "\n", - " Metadata_Cells_Number_Object_Number Metadata_Cytoplasm_Parent_Cells \n", - "0 4 4 \\\n", + " Metadata_Cells_Number_Object_Number Metadata_Cytoplasm_Parent_Cells \\\n", + "0 4 4 \n", "1 5 5 \n", "2 1 1 \n", "3 2 2 \n", "4 3 3 \n", "\n", - " Metadata_Cytoplasm_Parent_Nuclei Metadata_Nuclei_Number_Object_Number \n", - "0 5 5 \\\n", + " Metadata_Cytoplasm_Parent_Nuclei Metadata_Nuclei_Number_Object_Number \\\n", + "0 5 5 \n", "1 6 6 \n", "2 1 1 \n", "3 2 2 \n", "4 3 3 \n", "\n", - " Cytoplasm_AreaShape_Area ... Nuclei_Texture_Variance_DAPI_3_02_256 \n", - "0 22157.0 ... 1281.874186 \\\n", + " Cytoplasm_AreaShape_Area ... Nuclei_Texture_Variance_DAPI_3_02_256 \\\n", + "0 22157.0 ... 1281.874186 \n", "1 11718.0 ... 1085.750460 \n", "2 17501.0 ... 1273.428721 \n", "3 17871.0 ... 633.124457 \n", "4 12098.0 ... 894.732816 \n", "\n", - " Nuclei_Texture_Variance_DAPI_3_03_256 \n", - "0 1257.435761 \\\n", + " Nuclei_Texture_Variance_DAPI_3_03_256 \\\n", + "0 1257.435761 \n", "1 1113.144205 \n", "2 1246.970723 \n", "3 642.170387 \n", "4 829.273862 \n", "\n", - " Nuclei_Texture_Variance_GFP_3_00_256 Nuclei_Texture_Variance_GFP_3_01_256 \n", - "0 65.965695 52.068222 \\\n", + " Nuclei_Texture_Variance_GFP_3_00_256 Nuclei_Texture_Variance_GFP_3_01_256 \\\n", + "0 65.965695 52.068222 \n", "1 139.037112 140.802921 \n", "2 137.466776 111.514400 \n", "3 190.690537 173.126428 \n", "4 142.997128 131.232052 \n", "\n", - " Nuclei_Texture_Variance_GFP_3_02_256 Nuclei_Texture_Variance_GFP_3_03_256 \n", - "0 50.445780 51.851812 \\\n", + " Nuclei_Texture_Variance_GFP_3_02_256 Nuclei_Texture_Variance_GFP_3_03_256 \\\n", + "0 50.445780 51.851812 \n", "1 141.819546 149.091779 \n", "2 113.076080 118.810204 \n", "3 170.503677 178.200219 \n", "4 126.981214 128.412295 \n", "\n", - " Nuclei_Texture_Variance_RFP_3_00_256 Nuclei_Texture_Variance_RFP_3_01_256 \n", - "0 425.319446 409.351012 \\\n", + " Nuclei_Texture_Variance_RFP_3_00_256 Nuclei_Texture_Variance_RFP_3_01_256 \\\n", + "0 425.319446 409.351012 \n", "1 512.879573 499.756267 \n", "2 311.232220 306.768555 \n", "3 401.039364 412.623493 \n", @@ -460,7 +488,7 @@ "[5 rows x 2313 columns]" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -468,13 +496,11 @@ "source": [ "converted_df = pd.read_parquet(plate_info_dictionary[\"Plate_4\"][\"dest_path\"])\n", "\n", - "# load in and print a converted df to see if it looks correct\n", "print(converted_df.shape)\n", "converted_df.head()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -483,7 +509,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -495,7 +521,7 @@ ], "metadata": { "kernelspec": { - "display_name": "nf1_cellpainting_data", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -510,9 +536,8 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.15" - }, - "orig_nbformat": 4 + } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/3.processing_features/scripts/0.merge_sc_cytotable.py b/3.processing_features/scripts/0.merge_sc_cytotable.py index 01e7179..d9e6e1f 100644 --- a/3.processing_features/scripts/0.merge_sc_cytotable.py +++ b/3.processing_features/scripts/0.merge_sc_cytotable.py @@ -38,10 +38,19 @@ # list for plate names based on folders to use to create dictionary plate_names = [] -# iterate through 0.download_data and append plate names from folder names that contain image data from that plate + +# iterate through 0.download_data and append plate names from folder names +# that contain image data from that plate +# (Note, you must first run `0.download_data/download_plates.ipynb`) for file_path in pathlib.Path("../0.download_data/").iterdir(): if str(file_path.stem).startswith("Plate"): plate_names.append(str(file_path.stem)) + +plate_names + + +# In[3]: + # preset configurations based on typical CellProfiler outputs preset = "cellprofiler_sqlite_pycytominer" @@ -80,7 +89,7 @@ # # **Note:** All paths must be string to use with CytoTable. -# In[3]: +# In[4]: # create plate info dictionary with all parts of the CellProfiler CLI command to run in parallel @@ -100,13 +109,14 @@ # ## Merge objects to single cells and convert SQLite to parquet file + add single cell metadata -# In[4]: +# In[5]: # run through each run with each set of paths based on dictionary for plate, info in plate_info_dictionary.items(): source_path = info["source_path"] dest_path = info["dest_path"] + print(f"Performing merge single cells and conversion on {plate}!") # merge single cells and output as parquet file @@ -122,22 +132,24 @@ sc_utils.add_sc_count_metadata_file( data_path=dest_path, well_column_name="Image_Metadata_Well", file_type="parquet" ) + print(f"Added single cell count as metadata to {pathlib.Path(dest_path).name}!") -# In[5]: +# ### Check if converted data looks correct + +# In[6]: converted_df = pd.read_parquet(plate_info_dictionary["Plate_4"]["dest_path"]) -# load in and print a converted df to see if it looks correct print(converted_df.shape) converted_df.head() # ## Write dictionary to yaml file for use in downstream steps -# In[6]: +# In[7]: dictionary_path = pathlib.Path("./plate_info_dictionary.yaml") From e0f64ad87e6cf92e84cfbea9b165d1c66efbd86c Mon Sep 17 00:00:00 2001 From: gwaybio Date: Fri, 4 Aug 2023 13:10:03 -0600 Subject: [PATCH 05/10] remove old pycytominer processing and daat --- 3.processing_features/1.aggregate_sc.ipynb | 450 -------------- 3.processing_features/2.annotate.ipynb | 562 ----------------- 3.processing_features/3.normalize.ipynb | 512 ---------------- 3.processing_features/4.feature_select.ipynb | 565 ------------------ .../data/aggregated_data/Plate_1_bulk.parquet | 3 - .../data/aggregated_data/Plate_2_bulk.parquet | 3 - .../data/aggregated_data/Plate_3_bulk.parquet | 3 - .../Plate_3_prime_bulk.parquet | 3 - .../data/aggregated_data/Plate_4_bulk.parquet | 3 - .../Plate_1_bulk_annotated.parquet | 3 - .../data/annotated_data/Plate_1_sc.parquet | 3 - .../Plate_2_bulk_annotated.parquet | 3 - .../data/annotated_data/Plate_2_sc.parquet | 3 - .../Plate_3_bulk_annotated.parquet | 3 - .../Plate_3_prime_bulk_annotated.parquet | 3 - .../annotated_data/Plate_3_prime_sc.parquet | 3 - .../data/annotated_data/Plate_3_sc.parquet | 3 - .../Plate_4_bulk_annotated.parquet | 3 - .../data/annotated_data/Plate_4_sc.parquet | 3 - .../scripts/1.aggregate_sc.py | 82 --- 3.processing_features/scripts/2.annotate.py | 157 ----- 3.processing_features/scripts/3.normalize.py | 117 ---- .../scripts/4.feature_select.py | 126 ---- 23 files changed, 2616 deletions(-) delete mode 100644 3.processing_features/1.aggregate_sc.ipynb delete mode 100644 3.processing_features/2.annotate.ipynb delete mode 100644 3.processing_features/3.normalize.ipynb delete mode 100644 3.processing_features/4.feature_select.ipynb delete mode 100644 3.processing_features/data/aggregated_data/Plate_1_bulk.parquet delete mode 100644 3.processing_features/data/aggregated_data/Plate_2_bulk.parquet delete mode 100644 3.processing_features/data/aggregated_data/Plate_3_bulk.parquet delete mode 100644 3.processing_features/data/aggregated_data/Plate_3_prime_bulk.parquet delete mode 100644 3.processing_features/data/aggregated_data/Plate_4_bulk.parquet delete mode 100644 3.processing_features/data/annotated_data/Plate_1_bulk_annotated.parquet delete mode 100644 3.processing_features/data/annotated_data/Plate_1_sc.parquet delete mode 100644 3.processing_features/data/annotated_data/Plate_2_bulk_annotated.parquet delete mode 100644 3.processing_features/data/annotated_data/Plate_2_sc.parquet delete mode 100644 3.processing_features/data/annotated_data/Plate_3_bulk_annotated.parquet delete mode 100644 3.processing_features/data/annotated_data/Plate_3_prime_bulk_annotated.parquet delete mode 100644 3.processing_features/data/annotated_data/Plate_3_prime_sc.parquet delete mode 100644 3.processing_features/data/annotated_data/Plate_3_sc.parquet delete mode 100644 3.processing_features/data/annotated_data/Plate_4_bulk_annotated.parquet delete mode 100644 3.processing_features/data/annotated_data/Plate_4_sc.parquet delete mode 100644 3.processing_features/scripts/1.aggregate_sc.py delete mode 100644 3.processing_features/scripts/2.annotate.py delete mode 100644 3.processing_features/scripts/3.normalize.py delete mode 100644 3.processing_features/scripts/4.feature_select.py diff --git a/3.processing_features/1.aggregate_sc.ipynb b/3.processing_features/1.aggregate_sc.ipynb deleted file mode 100644 index 352fd03..0000000 --- a/3.processing_features/1.aggregate_sc.ipynb +++ /dev/null @@ -1,450 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Perform aggregation on all plates to output bulk profiles" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "import yaml\n", - "import pprint\n", - "\n", - "import pandas as pd\n", - "from pycytominer import aggregate\n", - "from pycytominer.cyto_utils import output" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set paths and variables" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{ 'Plate_1': { 'dest_path': 'data/converted_data/Plate_1.parquet',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},\n", - " 'Plate_2': { 'dest_path': 'data/converted_data/Plate_2.parquet',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},\n", - " 'Plate_3': { 'dest_path': 'data/converted_data/Plate_3.parquet',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},\n", - " 'Plate_3_prime': { 'dest_path': 'data/converted_data/Plate_3_prime.parquet',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},\n", - " 'Plate_4': { 'dest_path': 'data/converted_data/Plate_4.parquet',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite'}}\n" - ] - } - ], - "source": [ - "# output directory for annotated data\n", - "output_dir = pathlib.Path(\"./data/aggregated_data\")\n", - "output_dir.mkdir(exist_ok=True)\n", - "\n", - "# load in dicionary from yaml file\n", - "dictionary_path = pathlib.Path(\"./plate_info_dictionary.yaml\")\n", - "with open(dictionary_path) as file:\n", - " plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)\n", - "\n", - "# view the dictionary to assess that all info is added correctly\n", - "pprint.pprint(plate_info_dictionary, indent=4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Peform aggregation\n", - "\n", - "**Note:** We use the default operation of `median` for aggregating the single cell data." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Performing aggregation on Plate_1!\n", - "The bulk profile for Plate_1 has been created and saved!\n", - "Performing aggregation on Plate_2!\n", - "The bulk profile for Plate_2 has been created and saved!\n", - "Performing aggregation on Plate_3!\n", - "The bulk profile for Plate_3 has been created and saved!\n", - "Performing aggregation on Plate_3_prime!\n", - "The bulk profile for Plate_3_prime has been created and saved!\n", - "Performing aggregation on Plate_4!\n", - "The bulk profile for Plate_4 has been created and saved!\n" - ] - } - ], - "source": [ - "for plate, info in plate_info_dictionary.items():\n", - " # single_cell_df is the dataframe loaded in from the converted parquet file\n", - " single_cell_df = pd.read_parquet(info[\"dest_path\"])\n", - " output_file = str(pathlib.Path(f\"{output_dir}/{plate}_bulk.parquet\"))\n", - " # save path to annotated file to dictionary for downstream use\n", - " plate_info_dictionary[plate][\"bulk_path\"] = output_file\n", - " print(f\"Performing aggregation on {plate}!\")\n", - "\n", - " # perform median aggregation (default) to ouput bulk features\n", - " aggregate_df = aggregate(\n", - " population_df=single_cell_df, strata=[\"Image_Metadata_Plate\", \"Image_Metadata_Well\"]\n", - " )\n", - "\n", - " # save aggregated df as parquet file\n", - " output(\n", - " df=aggregate_df,\n", - " output_filename=output_file,\n", - " output_type=\"parquet\",\n", - " )\n", - " print(f\"The bulk profile for {plate} has been created and saved!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(60, 2306)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Image_Metadata_PlateImage_Metadata_WellCytoplasm_AreaShape_AreaCytoplasm_AreaShape_BoundingBoxAreaCytoplasm_AreaShape_BoundingBoxMaximum_XCytoplasm_AreaShape_BoundingBoxMaximum_YCytoplasm_AreaShape_BoundingBoxMinimum_XCytoplasm_AreaShape_BoundingBoxMinimum_YCytoplasm_AreaShape_Center_XCytoplasm_AreaShape_Center_Y...Nuclei_Texture_Variance_DAPI_3_02_256Nuclei_Texture_Variance_DAPI_3_03_256Nuclei_Texture_Variance_GFP_3_00_256Nuclei_Texture_Variance_GFP_3_01_256Nuclei_Texture_Variance_GFP_3_02_256Nuclei_Texture_Variance_GFP_3_03_256Nuclei_Texture_Variance_RFP_3_00_256Nuclei_Texture_Variance_RFP_3_01_256Nuclei_Texture_Variance_RFP_3_02_256Nuclei_Texture_Variance_RFP_3_03_256
0Plate_4B1018439.054378.0721.0596.0459.0344.0589.322731467.639272...1402.1767461390.439448343.450989339.348995361.993801354.468871402.157284408.136027406.956549401.988368
1Plate_4B1114858.043289.0792.5555.0607.5325.0693.164644450.720216...1403.5493441402.376148267.296946256.675966262.744632261.219410353.841985351.173117366.253475365.439917
2Plate_4B215746.042441.0705.0606.0453.0363.0599.574838505.885777...1360.1891031334.643687266.632008266.805205260.368846255.193937518.194881511.552822506.730303504.779113
3Plate_4B317612.546376.0728.0502.0479.5268.5604.916368374.651000...1510.0038891515.435019270.317856263.647022266.047190255.156976431.852250428.683538423.755592420.139912
4Plate_4B416847.548662.0734.0477.5522.0264.5627.978123372.343190...1519.3752801480.309106332.662770331.642057337.357941327.187538429.636663434.896642433.045802436.874798
\n", - "

5 rows × 2306 columns

\n", - "
" - ], - "text/plain": [ - " Image_Metadata_Plate Image_Metadata_Well Cytoplasm_AreaShape_Area \n", - "0 Plate_4 B10 18439.0 \\\n", - "1 Plate_4 B11 14858.0 \n", - "2 Plate_4 B2 15746.0 \n", - "3 Plate_4 B3 17612.5 \n", - "4 Plate_4 B4 16847.5 \n", - "\n", - " Cytoplasm_AreaShape_BoundingBoxArea \n", - "0 54378.0 \\\n", - "1 43289.0 \n", - "2 42441.0 \n", - "3 46376.0 \n", - "4 48662.0 \n", - "\n", - " Cytoplasm_AreaShape_BoundingBoxMaximum_X \n", - "0 721.0 \\\n", - "1 792.5 \n", - "2 705.0 \n", - "3 728.0 \n", - "4 734.0 \n", - "\n", - " Cytoplasm_AreaShape_BoundingBoxMaximum_Y \n", - "0 596.0 \\\n", - "1 555.0 \n", - "2 606.0 \n", - "3 502.0 \n", - "4 477.5 \n", - "\n", - " Cytoplasm_AreaShape_BoundingBoxMinimum_X \n", - "0 459.0 \\\n", - "1 607.5 \n", - "2 453.0 \n", - "3 479.5 \n", - "4 522.0 \n", - "\n", - " Cytoplasm_AreaShape_BoundingBoxMinimum_Y Cytoplasm_AreaShape_Center_X \n", - "0 344.0 589.322731 \\\n", - "1 325.0 693.164644 \n", - "2 363.0 599.574838 \n", - "3 268.5 604.916368 \n", - "4 264.5 627.978123 \n", - "\n", - " Cytoplasm_AreaShape_Center_Y ... Nuclei_Texture_Variance_DAPI_3_02_256 \n", - "0 467.639272 ... 1402.176746 \\\n", - "1 450.720216 ... 1403.549344 \n", - "2 505.885777 ... 1360.189103 \n", - "3 374.651000 ... 1510.003889 \n", - "4 372.343190 ... 1519.375280 \n", - "\n", - " Nuclei_Texture_Variance_DAPI_3_03_256 \n", - "0 1390.439448 \\\n", - "1 1402.376148 \n", - "2 1334.643687 \n", - "3 1515.435019 \n", - "4 1480.309106 \n", - "\n", - " Nuclei_Texture_Variance_GFP_3_00_256 Nuclei_Texture_Variance_GFP_3_01_256 \n", - "0 343.450989 339.348995 \\\n", - "1 267.296946 256.675966 \n", - "2 266.632008 266.805205 \n", - "3 270.317856 263.647022 \n", - "4 332.662770 331.642057 \n", - "\n", - " Nuclei_Texture_Variance_GFP_3_02_256 Nuclei_Texture_Variance_GFP_3_03_256 \n", - "0 361.993801 354.468871 \\\n", - "1 262.744632 261.219410 \n", - "2 260.368846 255.193937 \n", - "3 266.047190 255.156976 \n", - "4 337.357941 327.187538 \n", - "\n", - " Nuclei_Texture_Variance_RFP_3_00_256 Nuclei_Texture_Variance_RFP_3_01_256 \n", - "0 402.157284 408.136027 \\\n", - "1 353.841985 351.173117 \n", - "2 518.194881 511.552822 \n", - "3 431.852250 428.683538 \n", - "4 429.636663 434.896642 \n", - "\n", - " Nuclei_Texture_Variance_RFP_3_02_256 Nuclei_Texture_Variance_RFP_3_03_256 \n", - "0 406.956549 401.988368 \n", - "1 366.253475 365.439917 \n", - "2 506.730303 504.779113 \n", - "3 423.755592 420.139912 \n", - "4 433.045802 436.874798 \n", - "\n", - "[5 rows x 2306 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# print last aggregate df to see if annotation occurred\n", - "print(aggregate_df.shape)\n", - "aggregate_df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Write updated dictionary to yaml file for use in downstream steps" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "with open(dictionary_path, \"w\") as file:\n", - " yaml.dump(plate_info_dictionary, file)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nf1_cellpainting_data", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.15" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/3.processing_features/2.annotate.ipynb b/3.processing_features/2.annotate.ipynb deleted file mode 100644 index d05aa18..0000000 --- a/3.processing_features/2.annotate.ipynb +++ /dev/null @@ -1,562 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Annotate merged single cells with metadata from platemap file for each plate" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "import pathlib\n", - "import yaml\n", - "import pprint\n", - "\n", - "import pandas as pd\n", - "from pycytominer import annotate\n", - "from pycytominer.cyto_utils import output\n", - "\n", - "sys.path.append(\"../utils\")\n", - "import extraction_utils as sc_utils" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set paths and variables" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# output directory for annotated data\n", - "output_dir = pathlib.Path(\"./data/annotated_data\")\n", - "output_dir.mkdir(exist_ok=True)\n", - "\n", - "# directory with metadata\n", - "metadata_dir = pathlib.Path(\"../0.download_data/metadata/\")\n", - "\n", - "# load in dicionary from yaml file\n", - "dictionary_path = pathlib.Path(\"./plate_info_dictionary.yaml\")\n", - "with open(dictionary_path) as file:\n", - " plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add metadata paths to loaded in dictionary" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{ 'Plate_1': { 'annotated_path': 'data/annotated_data/Plate_1_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_1_bulk_annotated.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_1_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_1.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate1.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},\n", - " 'Plate_2': { 'annotated_path': 'data/annotated_data/Plate_2_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_2_bulk_annotated.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_2_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_2.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate2.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},\n", - " 'Plate_3': { 'annotated_path': 'data/annotated_data/Plate_3_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_3_bulk_annotated.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_3_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_3.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},\n", - " 'Plate_3_prime': { 'annotated_path': 'data/annotated_data/Plate_3_prime_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_3_prime_bulk_annotated.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_3_prime_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_3_prime.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},\n", - " 'Plate_4': { 'annotated_path': 'data/annotated_data/Plate_4_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_4_bulk_annotated.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_4_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_4.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate4.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite'}}\n" - ] - } - ], - "source": [ - "# add path to platemaps for each plate \n", - "for plate in plate_info_dictionary.keys():\n", - " # since Plate_3_prime has the same platemap as Plate_3, we need an else statement so that we make sure it adds the \n", - " # path that was given to Plate_3\n", - " if plate != \"Plate_3_prime\":\n", - " # match the naming format of the plates to the platemap file\n", - " plate_info_dictionary[plate][\"platemap_path\"] = str(\n", - " pathlib.Path(list(metadata_dir.rglob(f\"platemap_NF1_{plate.replace('_', '').lower()}.csv\"))[0]).resolve(\n", - " strict=True\n", - " )\n", - " )\n", - " else:\n", - " plate_info_dictionary[\"Plate_3_prime\"][\"platemap_path\"] = plate_info_dictionary[\"Plate_3\"][\"platemap_path\"]\n", - "\n", - "# view the dictionary to assess that all info is added correctly\n", - "pprint.pprint(plate_info_dictionary, indent=4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Annotate bulk profiles\n", - "\n", - "**Note:** The path to the annotated bulk file to be used for normalization is adding during this step." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Adding annotations to merged single cells for Plate_1!\n", - "Annotations have been added to Plate_1 bulk profiles and saved!\n", - "Adding annotations to merged single cells for Plate_2!\n", - "Annotations have been added to Plate_2 bulk profiles and saved!\n", - "Adding annotations to merged single cells for Plate_3!\n", - "Annotations have been added to Plate_3 bulk profiles and saved!\n", - "Adding annotations to merged single cells for Plate_3_prime!\n", - "Annotations have been added to Plate_3_prime bulk profiles and saved!\n", - "Adding annotations to merged single cells for Plate_4!\n", - "Annotations have been added to Plate_4 bulk profiles and saved!\n" - ] - } - ], - "source": [ - "for plate, info in plate_info_dictionary.items():\n", - " # single_cell_df is the dataframe loaded in from the converted parquet file\n", - " single_cell_df = pd.read_parquet(info[\"bulk_path\"])\n", - " platemap_df = pd.read_csv(info[\"platemap_path\"])\n", - " output_file = str(pathlib.Path(f\"{output_dir}/{plate}_bulk_annotated.parquet\"))\n", - " # save path to annotated file to dictionary for downstream use\n", - " plate_info_dictionary[plate][\"bulk_annotated_path\"] = output_file\n", - " print(f\"Adding annotations to merged single cells for {plate}!\")\n", - "\n", - " # add metadata from platemap file to extracted single cell features\n", - " annotated_df = annotate(\n", - " profiles=single_cell_df,\n", - " platemap=platemap_df,\n", - " join_on=[\"Metadata_well_position\", \"Image_Metadata_Well\"],\n", - " )\n", - "\n", - " # save annotated df as parquet file\n", - " output(\n", - " df=annotated_df,\n", - " output_filename=output_file,\n", - " output_type=\"parquet\",\n", - " )\n", - " print(f\"Annotations have been added to {plate} bulk profiles and saved!\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Annotate merged single cells\n", - "\n", - "**Note:** The path to the annotated file to be used for normalization is adding during this step." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Adding annotations to merged single cells for Plate_1!\n", - "Annotations have been added to Plate_1 and saved!\n", - "Adding annotations to merged single cells for Plate_2!\n", - "Annotations have been added to Plate_2 and saved!\n", - "Adding annotations to merged single cells for Plate_3!\n", - "Annotations have been added to Plate_3 and saved!\n", - "Adding annotations to merged single cells for Plate_3_prime!\n", - "Annotations have been added to Plate_3_prime and saved!\n", - "Adding annotations to merged single cells for Plate_4!\n", - "Annotations have been added to Plate_4 and saved!\n" - ] - } - ], - "source": [ - "for plate, info in plate_info_dictionary.items():\n", - " # single_cell_df is the dataframe loaded in from the converted parquet file\n", - " single_cell_df = pd.read_parquet(info[\"dest_path\"])\n", - " platemap_df = pd.read_csv(info[\"platemap_path\"])\n", - " output_file = str(pathlib.Path(f\"{output_dir}/{plate}_sc.parquet\"))\n", - " # save path to annotated file to dictionary for downstream use\n", - " plate_info_dictionary[plate][\"annotated_path\"] = output_file\n", - " print(f\"Adding annotations to merged single cells for {plate}!\")\n", - "\n", - " # add metadata from platemap file to extracted single cell features\n", - " annotated_df = annotate(\n", - " profiles=single_cell_df,\n", - " platemap=platemap_df,\n", - " join_on=[\"Metadata_well_position\", \"Image_Metadata_Well\"],\n", - " )\n", - "\n", - " # rename site column to avoid any issues with identifying the column as metadata over feature\n", - " annotated_df = annotated_df.rename(columns={\"Image_Metadata_Site\": \"Metadata_Site\"})\n", - "\n", - " # move metadata well, single cell count, and site to the front of the df (for easy visualization in python)\n", - " well_column = annotated_df.pop(\"Metadata_Well\")\n", - " singlecell_column = annotated_df.pop(\"Metadata_number_of_singlecells\")\n", - " site_column = annotated_df.pop(\"Metadata_Site\") \n", - "\n", - " # insert the columns in specific parts of the dataframe\n", - " annotated_df.insert(2, \"Metadata_Well\", well_column)\n", - " annotated_df.insert(3, \"Metadata_Site\", site_column)\n", - " annotated_df.insert(4, \"Metadata_number_of_singlecells\", singlecell_column)\n", - "\n", - " # save annotated df as parquet file\n", - " output(\n", - " df=annotated_df,\n", - " output_filename=output_file,\n", - " output_type=\"parquet\",\n", - " )\n", - " print(f\"Annotations have been added to {plate} and saved!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(7502, 2321)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Metadata_WellRowMetadata_WellColMetadata_WellMetadata_SiteMetadata_number_of_singlecellsMetadata_gene_nameMetadata_genotypeMetadata_seed_densityMetadata_siRNAMetadata_RNAiMax...Nuclei_Texture_Variance_DAPI_3_02_256Nuclei_Texture_Variance_DAPI_3_03_256Nuclei_Texture_Variance_GFP_3_00_256Nuclei_Texture_Variance_GFP_3_01_256Nuclei_Texture_Variance_GFP_3_02_256Nuclei_Texture_Variance_GFP_3_03_256Nuclei_Texture_Variance_RFP_3_00_256Nuclei_Texture_Variance_RFP_3_01_256Nuclei_Texture_Variance_RFP_3_02_256Nuclei_Texture_Variance_RFP_3_03_256
0B2B211115NF1WT1000NaN0...886.555259887.126996220.059165178.494944177.175567178.5363641947.6891171942.5052371929.8833491954.302782
1B2B211115NF1WT1000NaN0...2269.7396022183.665041197.803849181.889009190.645422197.228087224.801646219.118772235.706020228.832375
2B2B211115NF1WT1000NaN0...993.492352951.907468424.891364390.009779381.521734390.667328623.836472616.211140661.626364636.520133
3B2B211115NF1WT1000NaN0...2636.4247082517.017120561.624256544.408328548.657987556.701996364.792509365.727202437.796494380.910214
4B2B214115NF1WT1000NaN0...652.339728635.869580126.833034120.283432109.813358109.301294299.201582297.223402320.864494303.344477
\n", - "

5 rows × 2321 columns

\n", - "
" - ], - "text/plain": [ - " Metadata_WellRow Metadata_WellCol Metadata_Well Metadata_Site \n", - "0 B 2 B2 11 \\\n", - "1 B 2 B2 11 \n", - "2 B 2 B2 11 \n", - "3 B 2 B2 11 \n", - "4 B 2 B2 14 \n", - "\n", - " Metadata_number_of_singlecells Metadata_gene_name Metadata_genotype \n", - "0 115 NF1 WT \\\n", - "1 115 NF1 WT \n", - "2 115 NF1 WT \n", - "3 115 NF1 WT \n", - "4 115 NF1 WT \n", - "\n", - " Metadata_seed_density Metadata_siRNA Metadata_RNAiMax ... \n", - "0 1000 NaN 0 ... \\\n", - "1 1000 NaN 0 ... \n", - "2 1000 NaN 0 ... \n", - "3 1000 NaN 0 ... \n", - "4 1000 NaN 0 ... \n", - "\n", - " Nuclei_Texture_Variance_DAPI_3_02_256 \n", - "0 886.555259 \\\n", - "1 2269.739602 \n", - "2 993.492352 \n", - "3 2636.424708 \n", - "4 652.339728 \n", - "\n", - " Nuclei_Texture_Variance_DAPI_3_03_256 Nuclei_Texture_Variance_GFP_3_00_256 \n", - "0 887.126996 220.059165 \\\n", - "1 2183.665041 197.803849 \n", - "2 951.907468 424.891364 \n", - "3 2517.017120 561.624256 \n", - "4 635.869580 126.833034 \n", - "\n", - " Nuclei_Texture_Variance_GFP_3_01_256 Nuclei_Texture_Variance_GFP_3_02_256 \n", - "0 178.494944 177.175567 \\\n", - "1 181.889009 190.645422 \n", - "2 390.009779 381.521734 \n", - "3 544.408328 548.657987 \n", - "4 120.283432 109.813358 \n", - "\n", - " Nuclei_Texture_Variance_GFP_3_03_256 Nuclei_Texture_Variance_RFP_3_00_256 \n", - "0 178.536364 1947.689117 \\\n", - "1 197.228087 224.801646 \n", - "2 390.667328 623.836472 \n", - "3 556.701996 364.792509 \n", - "4 109.301294 299.201582 \n", - "\n", - " Nuclei_Texture_Variance_RFP_3_01_256 Nuclei_Texture_Variance_RFP_3_02_256 \n", - "0 1942.505237 1929.883349 \\\n", - "1 219.118772 235.706020 \n", - "2 616.211140 661.626364 \n", - "3 365.727202 437.796494 \n", - "4 297.223402 320.864494 \n", - "\n", - " Nuclei_Texture_Variance_RFP_3_03_256 \n", - "0 1954.302782 \n", - "1 228.832375 \n", - "2 636.520133 \n", - "3 380.910214 \n", - "4 303.344477 \n", - "\n", - "[5 rows x 2321 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# print last annotated df to see if annotation occurred\n", - "print(annotated_df.shape)\n", - "annotated_df.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Write updated dictionary to yaml file for use in downstream steps" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "with open(dictionary_path, \"w\") as file:\n", - " yaml.dump(plate_info_dictionary, file)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nf1_cellpainting_data", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.15" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/3.processing_features/3.normalize.ipynb b/3.processing_features/3.normalize.ipynb deleted file mode 100644 index 880e674..0000000 --- a/3.processing_features/3.normalize.ipynb +++ /dev/null @@ -1,512 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Normalize merged single cells with standardized method for each plate" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "import yaml\n", - "import pprint\n", - "\n", - "import pandas as pd\n", - "from pycytominer import normalize\n", - "from pycytominer.cyto_utils import output" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set paths and load in dictionary from annotated run" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'Plate_1': {'annotated_path': 'data/annotated_data/Plate_1_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_1_bulk_annotated.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_1_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_1.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate1.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},\n", - " 'Plate_2': {'annotated_path': 'data/annotated_data/Plate_2_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_2_bulk_annotated.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_2_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_2.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate2.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},\n", - " 'Plate_3': {'annotated_path': 'data/annotated_data/Plate_3_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_3_bulk_annotated.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_3_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_3.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},\n", - " 'Plate_3_prime': {'annotated_path': 'data/annotated_data/Plate_3_prime_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_3_prime_bulk_annotated.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_3_prime_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_3_prime.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},\n", - " 'Plate_4': {'annotated_path': 'data/annotated_data/Plate_4_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_4_bulk_annotated.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_4_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_4.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate4.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite'}}\n" - ] - } - ], - "source": [ - "# output directory for normalized data\n", - "output_dir = pathlib.Path(\"./data/normalized_data\")\n", - "output_dir.mkdir(exist_ok=True)\n", - "\n", - "# load in dicionary from yaml file\n", - "dictionary_path = pathlib.Path(\"./plate_info_dictionary.yaml\")\n", - "with open(dictionary_path) as file:\n", - " plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)\n", - "\n", - "# view the dictionary to assess that all info is added correctly\n", - "pprint.pprint(plate_info_dictionary)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Normalize annotated bulk profiles from each plate" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Normalizing annotated bulk profiles for Plate_1!\n", - "Bulk profiles have been normalized for Plate_1 and saved!\n", - "Normalizing annotated bulk profiles for Plate_2!\n", - "Bulk profiles have been normalized for Plate_2 and saved!\n", - "Normalizing annotated bulk profiles for Plate_3!\n", - "Bulk profiles have been normalized for Plate_3 and saved!\n", - "Normalizing annotated bulk profiles for Plate_3_prime!\n", - "Bulk profiles have been normalized for Plate_3_prime and saved!\n", - "Normalizing annotated bulk profiles for Plate_4!\n", - "Bulk profiles have been normalized for Plate_4 and saved!\n" - ] - } - ], - "source": [ - "# process each run\n", - "for plate, info in plate_info_dictionary.items():\n", - " annotated_df = pd.read_parquet(info[\"bulk_annotated_path\"])\n", - " # set output path and add to the dictionary\n", - " output_file = str(pathlib.Path(f\"{output_dir}/{plate}_bulk_norm.parquet\"))\n", - " # save path to normalized file to dictionary for downstream use\n", - " plate_info_dictionary[plate][\"bulk_normalized_path\"] = output_file\n", - " print(f\"Normalizing annotated bulk profiles for {plate}!\")\n", - "\n", - " # normalize annotated data\n", - " normalized_df = normalize(\n", - " # df with annotated raw merged single cell features\n", - " profiles=annotated_df,\n", - " # normalization method used\n", - " method=\"standardize\"\n", - " )\n", - "\n", - " # save df as parquet file\n", - " output(\n", - " df=normalized_df,\n", - " output_filename=output_file,\n", - " output_type=\"parquet\",\n", - " )\n", - " print(f\"Bulk profiles have been normalized for {plate} and saved!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Normalize annotated single cells from each plate\n", - "\n", - "**Note:** Path to normalized data for each plate is added to the dictionary in this step to be used during feature selection." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Normalizing annotated merged single cells for Plate_1!\n", - "Single cells have been normalized for Plate_1 and saved!\n", - "Normalizing annotated merged single cells for Plate_2!\n", - "Single cells have been normalized for Plate_2 and saved!\n", - "Normalizing annotated merged single cells for Plate_3!\n", - "Single cells have been normalized for Plate_3 and saved!\n", - "Normalizing annotated merged single cells for Plate_3_prime!\n", - "Single cells have been normalized for Plate_3_prime and saved!\n", - "Normalizing annotated merged single cells for Plate_4!\n", - "Single cells have been normalized for Plate_4 and saved!\n" - ] - } - ], - "source": [ - "# process each run\n", - "for plate, info in plate_info_dictionary.items():\n", - " annotated_df = pd.read_parquet(info[\"annotated_path\"])\n", - " # set output path and add to the dictionary\n", - " output_file = str(pathlib.Path(f\"{output_dir}/{plate}_sc_norm.parquet\"))\n", - " # save path to normalized file to dictionary for downstream use\n", - " plate_info_dictionary[plate][\"normalized_path\"] = output_file\n", - " print(f\"Normalizing annotated merged single cells for {plate}!\")\n", - "\n", - " # normalize annotated data\n", - " normalized_df = normalize(\n", - " # df with annotated raw merged single cell features\n", - " profiles=annotated_df,\n", - " # normalization method used\n", - " method=\"standardize\"\n", - " )\n", - "\n", - " # save df as parquet file\n", - " output(\n", - " df=normalized_df,\n", - " output_filename=output_file,\n", - " output_type=\"parquet\",\n", - " )\n", - " print(f\"Single cells have been normalized for {plate} and saved!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(7502, 2321)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Metadata_WellRowMetadata_WellColMetadata_WellMetadata_SiteMetadata_number_of_singlecellsMetadata_gene_nameMetadata_genotypeMetadata_seed_densityMetadata_siRNAMetadata_RNAiMax...Nuclei_Texture_Variance_DAPI_3_02_256Nuclei_Texture_Variance_DAPI_3_03_256Nuclei_Texture_Variance_GFP_3_00_256Nuclei_Texture_Variance_GFP_3_01_256Nuclei_Texture_Variance_GFP_3_02_256Nuclei_Texture_Variance_GFP_3_03_256Nuclei_Texture_Variance_RFP_3_00_256Nuclei_Texture_Variance_RFP_3_01_256Nuclei_Texture_Variance_RFP_3_02_256Nuclei_Texture_Variance_RFP_3_03_256
0B2B211115NF1WT1000None0...-1.015567-0.991013-0.427085-0.492375-0.505686-0.4924833.2879443.3243963.2342853.337852
1B2B211115NF1WT1000None0...1.2545951.181604-0.467826-0.486035-0.481063-0.457716-0.661907-0.668559-0.634783-0.644279
2B2B211115NF1WT1000None0...-0.840056-0.882460-0.052123-0.097227-0.132127-0.0979070.2529110.2514740.3379100.296604
3B2B211115NF1WT1000None0...1.8564201.7402040.1981780.1912180.1734100.210927-0.340967-0.328878-0.173260-0.293306
4B2B214115NF1WT1000None0...-1.399975-1.412046-0.597744-0.601125-0.628829-0.621264-0.491339-0.487596-0.440303-0.472316
\n", - "

5 rows × 2321 columns

\n", - "
" - ], - "text/plain": [ - " Metadata_WellRow Metadata_WellCol Metadata_Well Metadata_Site \n", - "0 B 2 B2 11 \\\n", - "1 B 2 B2 11 \n", - "2 B 2 B2 11 \n", - "3 B 2 B2 11 \n", - "4 B 2 B2 14 \n", - "\n", - " Metadata_number_of_singlecells Metadata_gene_name Metadata_genotype \n", - "0 115 NF1 WT \\\n", - "1 115 NF1 WT \n", - "2 115 NF1 WT \n", - "3 115 NF1 WT \n", - "4 115 NF1 WT \n", - "\n", - " Metadata_seed_density Metadata_siRNA Metadata_RNAiMax ... \n", - "0 1000 None 0 ... \\\n", - "1 1000 None 0 ... \n", - "2 1000 None 0 ... \n", - "3 1000 None 0 ... \n", - "4 1000 None 0 ... \n", - "\n", - " Nuclei_Texture_Variance_DAPI_3_02_256 \n", - "0 -1.015567 \\\n", - "1 1.254595 \n", - "2 -0.840056 \n", - "3 1.856420 \n", - "4 -1.399975 \n", - "\n", - " Nuclei_Texture_Variance_DAPI_3_03_256 Nuclei_Texture_Variance_GFP_3_00_256 \n", - "0 -0.991013 -0.427085 \\\n", - "1 1.181604 -0.467826 \n", - "2 -0.882460 -0.052123 \n", - "3 1.740204 0.198178 \n", - "4 -1.412046 -0.597744 \n", - "\n", - " Nuclei_Texture_Variance_GFP_3_01_256 Nuclei_Texture_Variance_GFP_3_02_256 \n", - "0 -0.492375 -0.505686 \\\n", - "1 -0.486035 -0.481063 \n", - "2 -0.097227 -0.132127 \n", - "3 0.191218 0.173410 \n", - "4 -0.601125 -0.628829 \n", - "\n", - " Nuclei_Texture_Variance_GFP_3_03_256 Nuclei_Texture_Variance_RFP_3_00_256 \n", - "0 -0.492483 3.287944 \\\n", - "1 -0.457716 -0.661907 \n", - "2 -0.097907 0.252911 \n", - "3 0.210927 -0.340967 \n", - "4 -0.621264 -0.491339 \n", - "\n", - " Nuclei_Texture_Variance_RFP_3_01_256 Nuclei_Texture_Variance_RFP_3_02_256 \n", - "0 3.324396 3.234285 \\\n", - "1 -0.668559 -0.634783 \n", - "2 0.251474 0.337910 \n", - "3 -0.328878 -0.173260 \n", - "4 -0.487596 -0.440303 \n", - "\n", - " Nuclei_Texture_Variance_RFP_3_03_256 \n", - "0 3.337852 \n", - "1 -0.644279 \n", - "2 0.296604 \n", - "3 -0.293306 \n", - "4 -0.472316 \n", - "\n", - "[5 rows x 2321 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# print last normalized df to see if looks like normalization has occurred\n", - "print(normalized_df.shape)\n", - "normalized_df.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Write updated dictionary to yaml file for use in downstream steps" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "with open(dictionary_path, 'w') as file:\n", - " yaml.dump(plate_info_dictionary, file)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nf1_cellpainting_data", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.15" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/3.processing_features/4.feature_select.ipynb b/3.processing_features/4.feature_select.ipynb deleted file mode 100644 index d7b6774..0000000 --- a/3.processing_features/4.feature_select.ipynb +++ /dev/null @@ -1,565 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Perform feature selection on normalized merged single cells for each plate" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "import yaml\n", - "import pprint\n", - "\n", - "import pandas as pd\n", - "from pycytominer import feature_select\n", - "from pycytominer.cyto_utils import output" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set paths and variables" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'Plate_1': {'annotated_path': 'data/annotated_data/Plate_1_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_1_bulk_annotated.parquet',\n", - " 'bulk_normalized_path': 'data/normalized_data/Plate_1_bulk_norm.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_1_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_1.parquet',\n", - " 'normalized_path': 'data/normalized_data/Plate_1_sc_norm.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate1.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},\n", - " 'Plate_2': {'annotated_path': 'data/annotated_data/Plate_2_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_2_bulk_annotated.parquet',\n", - " 'bulk_normalized_path': 'data/normalized_data/Plate_2_bulk_norm.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_2_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_2.parquet',\n", - " 'normalized_path': 'data/normalized_data/Plate_2_sc_norm.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate2.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},\n", - " 'Plate_3': {'annotated_path': 'data/annotated_data/Plate_3_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_3_bulk_annotated.parquet',\n", - " 'bulk_normalized_path': 'data/normalized_data/Plate_3_bulk_norm.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_3_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_3.parquet',\n", - " 'normalized_path': 'data/normalized_data/Plate_3_sc_norm.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},\n", - " 'Plate_3_prime': {'annotated_path': 'data/annotated_data/Plate_3_prime_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_3_prime_bulk_annotated.parquet',\n", - " 'bulk_normalized_path': 'data/normalized_data/Plate_3_prime_bulk_norm.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_3_prime_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_3_prime.parquet',\n", - " 'normalized_path': 'data/normalized_data/Plate_3_prime_sc_norm.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},\n", - " 'Plate_4': {'annotated_path': 'data/annotated_data/Plate_4_sc.parquet',\n", - " 'bulk_annotated_path': 'data/annotated_data/Plate_4_bulk_annotated.parquet',\n", - " 'bulk_normalized_path': 'data/normalized_data/Plate_4_bulk_norm.parquet',\n", - " 'bulk_path': 'data/aggregated_data/Plate_4_bulk.parquet',\n", - " 'dest_path': 'data/converted_data/Plate_4.parquet',\n", - " 'normalized_path': 'data/normalized_data/Plate_4_sc_norm.parquet',\n", - " 'platemap_path': '/home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate4.csv',\n", - " 'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite'}}\n" - ] - } - ], - "source": [ - "# output directory for feature selected data\n", - "output_dir = pathlib.Path(\"./data/feature_selected_data\")\n", - "output_dir.mkdir(exist_ok=True)\n", - "\n", - "# load in dicionary from yaml file\n", - "dictionary_path = pathlib.Path(\"./plate_info_dictionary.yaml\")\n", - "with open(dictionary_path) as file:\n", - " plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)\n", - "\n", - "# view the dictionary to assess that all info is added correctly\n", - "pprint.pprint(plate_info_dictionary)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Perform feature selection\n", - "\n", - "The operations that we are using for feature selection are:\n", - "\n", - "- `variance_threshold`: creates a list of excluded features that have very low varience of values between single cells\n", - " \n", - "- `correlation_threshold`: creates a list of excluded features with a correlation to at least one other feature greater than the default threshold (`threshold=0.9`)\n", - " \n", - "- `blocklist`: creates a list of excluded features using the [standard blocklist file](https://github.com/cytomining/pycytominer/blob/master/pycytominer/data/blocklist_features.txt) for CellProfiler features from Pycytominer\n", - "\n", - "For more information regarding these operations, please visit [the Pycytominer operations folder](https://github.com/cytomining/pycytominer/tree/master/pycytominer/operations) on GitHub.\n", - "To view how `blocklist` works, please visit [the separate file](https://github.com/cytomining/pycytominer/blob/a5ae6c81a275b692ef5d4c85cfeb37696bf69242/pycytominer/cyto_utils/features.py#L13) for that function." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set operations for feature selection" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# list of operations for feature select function to use on input profile\n", - "feature_select_ops = [\n", - " \"variance_threshold\",\n", - " \"correlation_threshold\",\n", - " \"blocklist\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Bulk profiles" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Performing feature selection on normalized annotated bulk profiles for Plate_1!\n", - "Features have been selected for Plate_1 bulk profiles and saved!\n", - "Performing feature selection on normalized annotated bulk profiles for Plate_2!\n", - "Features have been selected for Plate_2 bulk profiles and saved!\n", - "Performing feature selection on normalized annotated bulk profiles for Plate_3!\n", - "Features have been selected for Plate_3 bulk profiles and saved!\n", - "Performing feature selection on normalized annotated bulk profiles for Plate_3_prime!\n", - "Features have been selected for Plate_3_prime bulk profiles and saved!\n", - "Performing feature selection on normalized annotated bulk profiles for Plate_4!\n", - "Features have been selected for Plate_4 bulk profiles and saved!\n" - ] - } - ], - "source": [ - "# process each bulk run\n", - "for plate, info in plate_info_dictionary.items():\n", - " normalized_df = pd.read_parquet(info[\"bulk_normalized_path\"])\n", - " # output_file does not need to be saved to dictionary as there are no more processing steps after this\n", - " output_file = str(pathlib.Path(f\"{output_dir}/{plate}_bulk_norm_fs.parquet\"))\n", - " print(f\"Performing feature selection on normalized annotated bulk profiles for {plate}!\")\n", - "\n", - " # perform feature selection with the operations specified\n", - " feature_select_df = feature_select(\n", - " normalized_df,\n", - " operation=feature_select_ops,\n", - " output_file=\"none\",\n", - " )\n", - "\n", - " # save features selected df as parquet file\n", - " output(\n", - " df=feature_select_df,\n", - " output_filename=output_file,\n", - " output_type=\"parquet\"\n", - " )\n", - " print(f\"Features have been selected for {plate} bulk profiles and saved!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Single cell profiles" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Performing feature selection on normalized annotated merged single cells for Plate_1!\n", - "Features have been selected for Plate_1 and saved!\n", - "Performing feature selection on normalized annotated merged single cells for Plate_2!\n", - "Features have been selected for Plate_2 and saved!\n", - "Performing feature selection on normalized annotated merged single cells for Plate_3!\n", - "Features have been selected for Plate_3 and saved!\n", - "Performing feature selection on normalized annotated merged single cells for Plate_3_prime!\n", - "Features have been selected for Plate_3_prime and saved!\n", - "Performing feature selection on normalized annotated merged single cells for Plate_4!\n", - "Features have been selected for Plate_4 and saved!\n" - ] - } - ], - "source": [ - "# process each single cell run\n", - "for plate, info in plate_info_dictionary.items():\n", - " normalized_df = pd.read_parquet(info[\"normalized_path\"])\n", - " # output_file does not need to be saved to dictionary as there are no more processing steps after this\n", - " output_file = str(pathlib.Path(f\"{output_dir}/{plate}_sc_norm_fs.parquet\"))\n", - " print(f\"Performing feature selection on normalized annotated merged single cells for {plate}!\")\n", - "\n", - " # perform feature selection with the operations specified\n", - " feature_select_df = feature_select(\n", - " normalized_df,\n", - " operation=feature_select_ops,\n", - " output_file=\"none\",\n", - " )\n", - "\n", - " # save features selected df as parquet file\n", - " output(\n", - " df=feature_select_df,\n", - " output_filename=output_file,\n", - " output_type=\"parquet\"\n", - " )\n", - " print(f\"Features have been selected for {plate} and saved!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(7502, 1171)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Metadata_WellRowMetadata_WellColMetadata_WellMetadata_SiteMetadata_number_of_singlecellsMetadata_gene_nameMetadata_genotypeMetadata_seed_densityMetadata_siRNAMetadata_RNAiMax...Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256Nuclei_Texture_SumEntropy_DAPI_3_03_256Nuclei_Texture_SumEntropy_RFP_3_00_256Nuclei_Texture_SumVariance_CY5_3_01_256Nuclei_Texture_SumVariance_DAPI_3_01_256Nuclei_Texture_SumVariance_GFP_3_03_256Nuclei_Texture_SumVariance_RFP_3_01_256
0B2B211115NF1WT1000None0...-0.803359-0.853646-0.593777-0.2878160.0785652.150259-0.548795-1.056228-0.4735402.716088
1B2B211115NF1WT1000None0...-0.344102-0.0564890.159731-0.0539300.779409-0.907817-0.5624081.374649-0.434801-0.678851
2B2B211115NF1WT1000None0...-0.874338-0.629045-0.341210-0.5873090.1533770.525008-0.068374-1.011699-0.1321830.077051
3B2B211115NF1WT1000None0...-0.277489-0.2017720.572489-0.0889430.345108-0.546746-0.4008751.9029850.079753-0.402336
4B2B214115NF1WT1000None0...-0.695947-0.3793750.724884-0.371386-0.247454-0.395320-0.713150-1.447722-0.577792-0.543383
\n", - "

5 rows × 1171 columns

\n", - "
" - ], - "text/plain": [ - " Metadata_WellRow Metadata_WellCol Metadata_Well Metadata_Site \n", - "0 B 2 B2 11 \\\n", - "1 B 2 B2 11 \n", - "2 B 2 B2 11 \n", - "3 B 2 B2 11 \n", - "4 B 2 B2 14 \n", - "\n", - " Metadata_number_of_singlecells Metadata_gene_name Metadata_genotype \n", - "0 115 NF1 WT \\\n", - "1 115 NF1 WT \n", - "2 115 NF1 WT \n", - "3 115 NF1 WT \n", - "4 115 NF1 WT \n", - "\n", - " Metadata_seed_density Metadata_siRNA Metadata_RNAiMax ... \n", - "0 1000 None 0 ... \\\n", - "1 1000 None 0 ... \n", - "2 1000 None 0 ... \n", - "3 1000 None 0 ... \n", - "4 1000 None 0 ... \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256 \n", - "0 -0.803359 \\\n", - "1 -0.344102 \n", - "2 -0.874338 \n", - "3 -0.277489 \n", - "4 -0.695947 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256 \n", - "0 -0.853646 \\\n", - "1 -0.056489 \n", - "2 -0.629045 \n", - "3 -0.201772 \n", - "4 -0.379375 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256 \n", - "0 -0.593777 \\\n", - "1 0.159731 \n", - "2 -0.341210 \n", - "3 0.572489 \n", - "4 0.724884 \n", - "\n", - " Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256 \n", - "0 -0.287816 \\\n", - "1 -0.053930 \n", - "2 -0.587309 \n", - "3 -0.088943 \n", - "4 -0.371386 \n", - "\n", - " Nuclei_Texture_SumEntropy_DAPI_3_03_256 \n", - "0 0.078565 \\\n", - "1 0.779409 \n", - "2 0.153377 \n", - "3 0.345108 \n", - "4 -0.247454 \n", - "\n", - " Nuclei_Texture_SumEntropy_RFP_3_00_256 \n", - "0 2.150259 \\\n", - "1 -0.907817 \n", - "2 0.525008 \n", - "3 -0.546746 \n", - "4 -0.395320 \n", - "\n", - " Nuclei_Texture_SumVariance_CY5_3_01_256 \n", - "0 -0.548795 \\\n", - "1 -0.562408 \n", - "2 -0.068374 \n", - "3 -0.400875 \n", - "4 -0.713150 \n", - "\n", - " Nuclei_Texture_SumVariance_DAPI_3_01_256 \n", - "0 -1.056228 \\\n", - "1 1.374649 \n", - "2 -1.011699 \n", - "3 1.902985 \n", - "4 -1.447722 \n", - "\n", - " Nuclei_Texture_SumVariance_GFP_3_03_256 \n", - "0 -0.473540 \\\n", - "1 -0.434801 \n", - "2 -0.132183 \n", - "3 0.079753 \n", - "4 -0.577792 \n", - "\n", - " Nuclei_Texture_SumVariance_RFP_3_01_256 \n", - "0 2.716088 \n", - "1 -0.678851 \n", - "2 0.077051 \n", - "3 -0.402336 \n", - "4 -0.543383 \n", - "\n", - "[5 rows x 1171 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# print last feature selected df to assess if feature selection occurred (less columns)\n", - "print(feature_select_df.shape)\n", - "feature_select_df.head()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nf1_cellpainting_data", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.15" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/3.processing_features/data/aggregated_data/Plate_1_bulk.parquet b/3.processing_features/data/aggregated_data/Plate_1_bulk.parquet deleted file mode 100644 index 430fd98..0000000 --- a/3.processing_features/data/aggregated_data/Plate_1_bulk.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cffb82b3d3108bbd282cbfd366c3101b67b7bb708a175c377eb0034d91da4c15 -size 1751351 diff --git a/3.processing_features/data/aggregated_data/Plate_2_bulk.parquet b/3.processing_features/data/aggregated_data/Plate_2_bulk.parquet deleted file mode 100644 index 39c2bb2..0000000 --- a/3.processing_features/data/aggregated_data/Plate_2_bulk.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:85e6f2e761dc103bd8927fa66177f1c3ce6d04c7aefd17a67ecc7d69495a5cd9 -size 2087172 diff --git a/3.processing_features/data/aggregated_data/Plate_3_bulk.parquet b/3.processing_features/data/aggregated_data/Plate_3_bulk.parquet deleted file mode 100644 index a2ff0b5..0000000 --- a/3.processing_features/data/aggregated_data/Plate_3_bulk.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4e5a9328fe7c4dce04775d7f0a5439f3e65cbca4b9f17453ff67f63cb7a9e984 -size 3578872 diff --git a/3.processing_features/data/aggregated_data/Plate_3_prime_bulk.parquet b/3.processing_features/data/aggregated_data/Plate_3_prime_bulk.parquet deleted file mode 100644 index 803caff..0000000 --- a/3.processing_features/data/aggregated_data/Plate_3_prime_bulk.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b7706570e4bf507b01a9ca7bcf53edca0cdc94a3628d2961074e6459fc6b8cff -size 3576497 diff --git a/3.processing_features/data/aggregated_data/Plate_4_bulk.parquet b/3.processing_features/data/aggregated_data/Plate_4_bulk.parquet deleted file mode 100644 index 3ca69ba..0000000 --- a/3.processing_features/data/aggregated_data/Plate_4_bulk.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:33f8276a7196f49f462a7320a0711df675eea65797e5bde6ad04bd8f994ec425 -size 3315389 diff --git a/3.processing_features/data/annotated_data/Plate_1_bulk_annotated.parquet b/3.processing_features/data/annotated_data/Plate_1_bulk_annotated.parquet deleted file mode 100644 index f6f45c5..0000000 --- a/3.processing_features/data/annotated_data/Plate_1_bulk_annotated.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c44ce53fb871b486ac6024d799a75f615015e3770f6b6611794e66322c8a10e2 -size 1753937 diff --git a/3.processing_features/data/annotated_data/Plate_1_sc.parquet b/3.processing_features/data/annotated_data/Plate_1_sc.parquet deleted file mode 100644 index da78a18..0000000 --- a/3.processing_features/data/annotated_data/Plate_1_sc.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bafef1e769a223d2277de6682b3c8c07169f48a01cfc460935953b82d1a72520 -size 5123338 diff --git a/3.processing_features/data/annotated_data/Plate_2_bulk_annotated.parquet b/3.processing_features/data/annotated_data/Plate_2_bulk_annotated.parquet deleted file mode 100644 index b695276..0000000 --- a/3.processing_features/data/annotated_data/Plate_2_bulk_annotated.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:796e03a9fbaf6e7fcf97ae91d1d4d4ffd32101b7ecd72b72a6777021b60bafcb -size 2089805 diff --git a/3.processing_features/data/annotated_data/Plate_2_sc.parquet b/3.processing_features/data/annotated_data/Plate_2_sc.parquet deleted file mode 100644 index 5f4d656..0000000 --- a/3.processing_features/data/annotated_data/Plate_2_sc.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d8814cfbf93d9cee3ea3219c8b25c6383bb46b388a32b0462bf668dfab7f2340 -size 26815268 diff --git a/3.processing_features/data/annotated_data/Plate_3_bulk_annotated.parquet b/3.processing_features/data/annotated_data/Plate_3_bulk_annotated.parquet deleted file mode 100644 index 714062a..0000000 --- a/3.processing_features/data/annotated_data/Plate_3_bulk_annotated.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f6c02d09fcb2a066dddd07c2ff540c08bf060bf47e141a6d11d78f6970219d9 -size 3582148 diff --git a/3.processing_features/data/annotated_data/Plate_3_prime_bulk_annotated.parquet b/3.processing_features/data/annotated_data/Plate_3_prime_bulk_annotated.parquet deleted file mode 100644 index be0ec6e..0000000 --- a/3.processing_features/data/annotated_data/Plate_3_prime_bulk_annotated.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dc39aec078714e78356f39f2bcdf894c4659c4d4d2a041c89cc0e532216aff1e -size 3579582 diff --git a/3.processing_features/data/annotated_data/Plate_3_prime_sc.parquet b/3.processing_features/data/annotated_data/Plate_3_prime_sc.parquet deleted file mode 100644 index 843d94c..0000000 --- a/3.processing_features/data/annotated_data/Plate_3_prime_sc.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8631fb1933d26df86461a79db8979aa55119d72d841d92feabcdaa774846585f -size 294391555 diff --git a/3.processing_features/data/annotated_data/Plate_3_sc.parquet b/3.processing_features/data/annotated_data/Plate_3_sc.parquet deleted file mode 100644 index bcb6cdc..0000000 --- a/3.processing_features/data/annotated_data/Plate_3_sc.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f2b63ab85c3f2f33ddce0d79a2c62af12787d11a7267e4a469e5b1286b40f7bc -size 379244379 diff --git a/3.processing_features/data/annotated_data/Plate_4_bulk_annotated.parquet b/3.processing_features/data/annotated_data/Plate_4_bulk_annotated.parquet deleted file mode 100644 index d640fec..0000000 --- a/3.processing_features/data/annotated_data/Plate_4_bulk_annotated.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6bf460f39f537805401689254eaae50e4ea86a000008281828ccf2b563d14920 -size 3321007 diff --git a/3.processing_features/data/annotated_data/Plate_4_sc.parquet b/3.processing_features/data/annotated_data/Plate_4_sc.parquet deleted file mode 100644 index a6ceb1a..0000000 --- a/3.processing_features/data/annotated_data/Plate_4_sc.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cacf2f91bf90cf8eb11250c2aaf1c23e85c715e369ede63763368ea3741e6a7b -size 151581085 diff --git a/3.processing_features/scripts/1.aggregate_sc.py b/3.processing_features/scripts/1.aggregate_sc.py deleted file mode 100644 index 62dfa18..0000000 --- a/3.processing_features/scripts/1.aggregate_sc.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# # Perform aggregation on all plates to output bulk profiles - -# ## Import libraries - -# In[1]: - - -import pathlib -import yaml -import pprint - -import pandas as pd -from pycytominer import aggregate -from pycytominer.cyto_utils import output - - -# ## Set paths and variables - -# In[2]: - - -# output directory for annotated data -output_dir = pathlib.Path("./data/aggregated_data") -output_dir.mkdir(exist_ok=True) - -# load in dicionary from yaml file -dictionary_path = pathlib.Path("./plate_info_dictionary.yaml") -with open(dictionary_path) as file: - plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader) - -# view the dictionary to assess that all info is added correctly -pprint.pprint(plate_info_dictionary, indent=4) - - -# ## Peform aggregation -# -# **Note:** We use the default operation of `median` for aggregating the single cell data. - -# In[3]: - - -for plate, info in plate_info_dictionary.items(): - # single_cell_df is the dataframe loaded in from the converted parquet file - single_cell_df = pd.read_parquet(info["dest_path"]) - output_file = str(pathlib.Path(f"{output_dir}/{plate}_bulk.parquet")) - # save path to annotated file to dictionary for downstream use - plate_info_dictionary[plate]["bulk_path"] = output_file - print(f"Performing aggregation on {plate}!") - - # perform median aggregation (default) to ouput bulk features - aggregate_df = aggregate( - population_df=single_cell_df, strata=["Image_Metadata_Plate", "Image_Metadata_Well"] - ) - - # save aggregated df as parquet file - output( - df=aggregate_df, - output_filename=output_file, - output_type="parquet", - ) - print(f"The bulk profile for {plate} has been created and saved!") - - -# In[4]: - - -# print last aggregate df to see if annotation occurred -print(aggregate_df.shape) -aggregate_df.head() - - -# ## Write updated dictionary to yaml file for use in downstream steps - -# In[5]: - - -with open(dictionary_path, "w") as file: - yaml.dump(plate_info_dictionary, file) - diff --git a/3.processing_features/scripts/2.annotate.py b/3.processing_features/scripts/2.annotate.py deleted file mode 100644 index 0d00660..0000000 --- a/3.processing_features/scripts/2.annotate.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# # Annotate merged single cells with metadata from platemap file for each plate - -# ## Import libraries - -# In[1]: - - -import sys -import pathlib -import yaml -import pprint - -import pandas as pd -from pycytominer import annotate -from pycytominer.cyto_utils import output - -sys.path.append("../utils") -import extraction_utils as sc_utils - - -# ## Set paths and variables - -# In[2]: - - -# output directory for annotated data -output_dir = pathlib.Path("./data/annotated_data") -output_dir.mkdir(exist_ok=True) - -# directory with metadata -metadata_dir = pathlib.Path("../0.download_data/metadata/") - -# load in dicionary from yaml file -dictionary_path = pathlib.Path("./plate_info_dictionary.yaml") -with open(dictionary_path) as file: - plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader) - - -# ## Add metadata paths to loaded in dictionary - -# In[3]: - - -# add path to platemaps for each plate -for plate in plate_info_dictionary.keys(): - # since Plate_3_prime has the same platemap as Plate_3, we need an else statement so that we make sure it adds the - # path that was given to Plate_3 - if plate != "Plate_3_prime": - # match the naming format of the plates to the platemap file - plate_info_dictionary[plate]["platemap_path"] = str( - pathlib.Path(list(metadata_dir.rglob(f"platemap_NF1_{plate.replace('_', '').lower()}.csv"))[0]).resolve( - strict=True - ) - ) - else: - plate_info_dictionary["Plate_3_prime"]["platemap_path"] = plate_info_dictionary["Plate_3"]["platemap_path"] - -# view the dictionary to assess that all info is added correctly -pprint.pprint(plate_info_dictionary, indent=4) - - -# ## Annotate bulk profiles -# -# **Note:** The path to the annotated bulk file to be used for normalization is adding during this step. - -# In[4]: - - -for plate, info in plate_info_dictionary.items(): - # single_cell_df is the dataframe loaded in from the converted parquet file - single_cell_df = pd.read_parquet(info["bulk_path"]) - platemap_df = pd.read_csv(info["platemap_path"]) - output_file = str(pathlib.Path(f"{output_dir}/{plate}_bulk_annotated.parquet")) - # save path to annotated file to dictionary for downstream use - plate_info_dictionary[plate]["bulk_annotated_path"] = output_file - print(f"Adding annotations to merged single cells for {plate}!") - - # add metadata from platemap file to extracted single cell features - annotated_df = annotate( - profiles=single_cell_df, - platemap=platemap_df, - join_on=["Metadata_well_position", "Image_Metadata_Well"], - ) - - # save annotated df as parquet file - output( - df=annotated_df, - output_filename=output_file, - output_type="parquet", - ) - print(f"Annotations have been added to {plate} bulk profiles and saved!") - - -# ## Annotate merged single cells -# -# **Note:** The path to the annotated file to be used for normalization is adding during this step. - -# In[5]: - - -for plate, info in plate_info_dictionary.items(): - # single_cell_df is the dataframe loaded in from the converted parquet file - single_cell_df = pd.read_parquet(info["dest_path"]) - platemap_df = pd.read_csv(info["platemap_path"]) - output_file = str(pathlib.Path(f"{output_dir}/{plate}_sc.parquet")) - # save path to annotated file to dictionary for downstream use - plate_info_dictionary[plate]["annotated_path"] = output_file - print(f"Adding annotations to merged single cells for {plate}!") - - # add metadata from platemap file to extracted single cell features - annotated_df = annotate( - profiles=single_cell_df, - platemap=platemap_df, - join_on=["Metadata_well_position", "Image_Metadata_Well"], - ) - - # rename site column to avoid any issues with identifying the column as metadata over feature - annotated_df = annotated_df.rename(columns={"Image_Metadata_Site": "Metadata_Site"}) - - # move metadata well, single cell count, and site to the front of the df (for easy visualization in python) - well_column = annotated_df.pop("Metadata_Well") - singlecell_column = annotated_df.pop("Metadata_number_of_singlecells") - site_column = annotated_df.pop("Metadata_Site") - - # insert the columns in specific parts of the dataframe - annotated_df.insert(2, "Metadata_Well", well_column) - annotated_df.insert(3, "Metadata_Site", site_column) - annotated_df.insert(4, "Metadata_number_of_singlecells", singlecell_column) - - # save annotated df as parquet file - output( - df=annotated_df, - output_filename=output_file, - output_type="parquet", - ) - print(f"Annotations have been added to {plate} and saved!") - - -# In[6]: - - -# print last annotated df to see if annotation occurred -print(annotated_df.shape) -annotated_df.head() - - -# ## Write updated dictionary to yaml file for use in downstream steps - -# In[7]: - - -with open(dictionary_path, "w") as file: - yaml.dump(plate_info_dictionary, file) - diff --git a/3.processing_features/scripts/3.normalize.py b/3.processing_features/scripts/3.normalize.py deleted file mode 100644 index 9a5d9e1..0000000 --- a/3.processing_features/scripts/3.normalize.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# ## Normalize merged single cells with standardized method for each plate - -# ## Import libraries - -# In[1]: - - -import pathlib -import yaml -import pprint - -import pandas as pd -from pycytominer import normalize -from pycytominer.cyto_utils import output - - -# ## Set paths and load in dictionary from annotated run - -# In[2]: - - -# output directory for normalized data -output_dir = pathlib.Path("./data/normalized_data") -output_dir.mkdir(exist_ok=True) - -# load in dicionary from yaml file -dictionary_path = pathlib.Path("./plate_info_dictionary.yaml") -with open(dictionary_path) as file: - plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader) - -# view the dictionary to assess that all info is added correctly -pprint.pprint(plate_info_dictionary) - - -# ## Normalize annotated bulk profiles from each plate - -# In[3]: - - -# process each run -for plate, info in plate_info_dictionary.items(): - annotated_df = pd.read_parquet(info["bulk_annotated_path"]) - # set output path and add to the dictionary - output_file = str(pathlib.Path(f"{output_dir}/{plate}_bulk_norm.parquet")) - # save path to normalized file to dictionary for downstream use - plate_info_dictionary[plate]["bulk_normalized_path"] = output_file - print(f"Normalizing annotated bulk profiles for {plate}!") - - # normalize annotated data - normalized_df = normalize( - # df with annotated raw merged single cell features - profiles=annotated_df, - # normalization method used - method="standardize" - ) - - # save df as parquet file - output( - df=normalized_df, - output_filename=output_file, - output_type="parquet", - ) - print(f"Bulk profiles have been normalized for {plate} and saved!") - - -# ## Normalize annotated single cells from each plate -# -# **Note:** Path to normalized data for each plate is added to the dictionary in this step to be used during feature selection. - -# In[4]: - - -# process each run -for plate, info in plate_info_dictionary.items(): - annotated_df = pd.read_parquet(info["annotated_path"]) - # set output path and add to the dictionary - output_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_norm.parquet")) - # save path to normalized file to dictionary for downstream use - plate_info_dictionary[plate]["normalized_path"] = output_file - print(f"Normalizing annotated merged single cells for {plate}!") - - # normalize annotated data - normalized_df = normalize( - # df with annotated raw merged single cell features - profiles=annotated_df, - # normalization method used - method="standardize" - ) - - # save df as parquet file - output( - df=normalized_df, - output_filename=output_file, - output_type="parquet", - ) - print(f"Single cells have been normalized for {plate} and saved!") - - -# In[5]: - - -# print last normalized df to see if looks like normalization has occurred -print(normalized_df.shape) -normalized_df.head() - - -# ## Write updated dictionary to yaml file for use in downstream steps - -# In[6]: - - -with open(dictionary_path, 'w') as file: - yaml.dump(plate_info_dictionary, file) - diff --git a/3.processing_features/scripts/4.feature_select.py b/3.processing_features/scripts/4.feature_select.py deleted file mode 100644 index 88e97a0..0000000 --- a/3.processing_features/scripts/4.feature_select.py +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# # Perform feature selection on normalized merged single cells for each plate - -# ## Import libraries - -# In[1]: - - -import pathlib -import yaml -import pprint - -import pandas as pd -from pycytominer import feature_select -from pycytominer.cyto_utils import output - - -# ## Set paths and variables - -# In[2]: - - -# output directory for feature selected data -output_dir = pathlib.Path("./data/feature_selected_data") -output_dir.mkdir(exist_ok=True) - -# load in dicionary from yaml file -dictionary_path = pathlib.Path("./plate_info_dictionary.yaml") -with open(dictionary_path) as file: - plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader) - -# view the dictionary to assess that all info is added correctly -pprint.pprint(plate_info_dictionary) - - -# ## Perform feature selection -# -# The operations that we are using for feature selection are: -# -# - `variance_threshold`: creates a list of excluded features that have very low varience of values between single cells -# -# - `correlation_threshold`: creates a list of excluded features with a correlation to at least one other feature greater than the default threshold (`threshold=0.9`) -# -# - `blocklist`: creates a list of excluded features using the [standard blocklist file](https://github.com/cytomining/pycytominer/blob/master/pycytominer/data/blocklist_features.txt) for CellProfiler features from Pycytominer -# -# For more information regarding these operations, please visit [the Pycytominer operations folder](https://github.com/cytomining/pycytominer/tree/master/pycytominer/operations) on GitHub. -# To view how `blocklist` works, please visit [the separate file](https://github.com/cytomining/pycytominer/blob/a5ae6c81a275b692ef5d4c85cfeb37696bf69242/pycytominer/cyto_utils/features.py#L13) for that function. - -# ### Set operations for feature selection - -# In[3]: - - -# list of operations for feature select function to use on input profile -feature_select_ops = [ - "variance_threshold", - "correlation_threshold", - "blocklist", -] - - -# ### Bulk profiles - -# In[4]: - - -# process each bulk run -for plate, info in plate_info_dictionary.items(): - normalized_df = pd.read_parquet(info["bulk_normalized_path"]) - # output_file does not need to be saved to dictionary as there are no more processing steps after this - output_file = str(pathlib.Path(f"{output_dir}/{plate}_bulk_norm_fs.parquet")) - print(f"Performing feature selection on normalized annotated bulk profiles for {plate}!") - - # perform feature selection with the operations specified - feature_select_df = feature_select( - normalized_df, - operation=feature_select_ops, - output_file="none", - ) - - # save features selected df as parquet file - output( - df=feature_select_df, - output_filename=output_file, - output_type="parquet" - ) - print(f"Features have been selected for {plate} bulk profiles and saved!") - - -# ### Single cell profiles - -# In[5]: - - -# process each single cell run -for plate, info in plate_info_dictionary.items(): - normalized_df = pd.read_parquet(info["normalized_path"]) - # output_file does not need to be saved to dictionary as there are no more processing steps after this - output_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_norm_fs.parquet")) - print(f"Performing feature selection on normalized annotated merged single cells for {plate}!") - - # perform feature selection with the operations specified - feature_select_df = feature_select( - normalized_df, - operation=feature_select_ops, - output_file="none", - ) - - # save features selected df as parquet file - output( - df=feature_select_df, - output_filename=output_file, - output_type="parquet" - ) - print(f"Features have been selected for {plate} and saved!") - - -# In[6]: - - -# print last feature selected df to assess if feature selection occurred (less columns) -print(feature_select_df.shape) -feature_select_df.head() - From b14357b989195b850de3dc9698f1f9cd857ef47c Mon Sep 17 00:00:00 2001 From: gwaybio Date: Fri, 4 Aug 2023 13:11:44 -0600 Subject: [PATCH 06/10] add reprocessed data --- 3.processing_features/data/bulk_profiles/Plate_1_bulk.parquet | 3 +++ .../data/bulk_profiles/Plate_1_bulk_annotated.parquet | 3 +++ .../data/bulk_profiles/Plate_1_bulk_feature_selected.parquet | 3 +++ .../data/bulk_profiles/Plate_1_bulk_normalized.parquet | 3 +++ 3.processing_features/data/bulk_profiles/Plate_2_bulk.parquet | 3 +++ .../data/bulk_profiles/Plate_2_bulk_annotated.parquet | 3 +++ .../data/bulk_profiles/Plate_2_bulk_feature_selected.parquet | 3 +++ .../data/bulk_profiles/Plate_2_bulk_normalized.parquet | 3 +++ 3.processing_features/data/bulk_profiles/Plate_3_bulk.parquet | 3 +++ .../data/bulk_profiles/Plate_3_bulk_annotated.parquet | 3 +++ .../data/bulk_profiles/Plate_3_bulk_feature_selected.parquet | 3 +++ .../data/bulk_profiles/Plate_3_bulk_normalized.parquet | 3 +++ .../data/bulk_profiles/Plate_3_prime_bulk.parquet | 3 +++ .../data/bulk_profiles/Plate_3_prime_bulk_annotated.parquet | 3 +++ .../bulk_profiles/Plate_3_prime_bulk_feature_selected.parquet | 3 +++ .../data/bulk_profiles/Plate_3_prime_bulk_normalized.parquet | 3 +++ 3.processing_features/data/bulk_profiles/Plate_4_bulk.parquet | 3 +++ .../data/bulk_profiles/Plate_4_bulk_annotated.parquet | 3 +++ .../data/bulk_profiles/Plate_4_bulk_feature_selected.parquet | 3 +++ .../data/bulk_profiles/Plate_4_bulk_normalized.parquet | 3 +++ 3.processing_features/data/converted_data/Plate_1.parquet | 2 +- 3.processing_features/data/converted_data/Plate_2.parquet | 2 +- 3.processing_features/data/converted_data/Plate_3.parquet | 2 +- .../data/converted_data/Plate_3_prime.parquet | 2 +- 3.processing_features/data/converted_data/Plate_4.parquet | 2 +- .../single_cell_profiles/Plate_1_bulk_camerons_method.parquet | 3 +++ .../data/single_cell_profiles/Plate_1_sc_annotated.parquet | 3 +++ .../single_cell_profiles/Plate_1_sc_feature_selected.parquet | 3 +++ .../data/single_cell_profiles/Plate_1_sc_normalized.parquet | 3 +++ .../single_cell_profiles/Plate_2_bulk_camerons_method.parquet | 3 +++ .../data/single_cell_profiles/Plate_2_sc_annotated.parquet | 3 +++ .../single_cell_profiles/Plate_2_sc_feature_selected.parquet | 3 +++ .../data/single_cell_profiles/Plate_2_sc_normalized.parquet | 3 +++ .../single_cell_profiles/Plate_3_bulk_camerons_method.parquet | 3 +++ .../Plate_3_prime_bulk_camerons_method.parquet | 3 +++ .../single_cell_profiles/Plate_3_prime_sc_annotated.parquet | 3 +++ .../Plate_3_prime_sc_feature_selected.parquet | 3 +++ .../single_cell_profiles/Plate_3_prime_sc_normalized.parquet | 3 +++ .../data/single_cell_profiles/Plate_3_sc_annotated.parquet | 3 +++ .../single_cell_profiles/Plate_3_sc_feature_selected.parquet | 3 +++ .../data/single_cell_profiles/Plate_3_sc_normalized.parquet | 3 +++ .../single_cell_profiles/Plate_4_bulk_camerons_method.parquet | 3 +++ .../data/single_cell_profiles/Plate_4_sc_annotated.parquet | 3 +++ .../single_cell_profiles/Plate_4_sc_feature_selected.parquet | 3 +++ .../data/single_cell_profiles/Plate_4_sc_normalized.parquet | 3 +++ 45 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 3.processing_features/data/bulk_profiles/Plate_1_bulk.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_1_bulk_annotated.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_1_bulk_feature_selected.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_1_bulk_normalized.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_2_bulk.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_2_bulk_annotated.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_2_bulk_feature_selected.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_2_bulk_normalized.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_3_bulk.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_3_bulk_annotated.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_3_bulk_feature_selected.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_3_bulk_normalized.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_3_prime_bulk.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_3_prime_bulk_annotated.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_3_prime_bulk_feature_selected.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_3_prime_bulk_normalized.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_4_bulk.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_4_bulk_annotated.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_4_bulk_feature_selected.parquet create mode 100644 3.processing_features/data/bulk_profiles/Plate_4_bulk_normalized.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_1_bulk_camerons_method.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_1_sc_annotated.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_1_sc_feature_selected.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_1_sc_normalized.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_2_bulk_camerons_method.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_2_sc_annotated.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_2_sc_feature_selected.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_2_sc_normalized.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_3_bulk_camerons_method.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_3_prime_bulk_camerons_method.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_annotated.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_feature_selected.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_normalized.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_3_sc_annotated.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_3_sc_feature_selected.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_3_sc_normalized.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_4_bulk_camerons_method.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_4_sc_annotated.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_4_sc_feature_selected.parquet create mode 100644 3.processing_features/data/single_cell_profiles/Plate_4_sc_normalized.parquet diff --git a/3.processing_features/data/bulk_profiles/Plate_1_bulk.parquet b/3.processing_features/data/bulk_profiles/Plate_1_bulk.parquet new file mode 100644 index 0000000..201709a --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_1_bulk.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c59430823129262a87d9abeafefe030e991304288a4c39e1257ba6a3d1b18403 +size 1751351 diff --git a/3.processing_features/data/bulk_profiles/Plate_1_bulk_annotated.parquet b/3.processing_features/data/bulk_profiles/Plate_1_bulk_annotated.parquet new file mode 100644 index 0000000..51119f8 --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_1_bulk_annotated.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c842deb42bb21e61686af8658d6ad6dce0c848c7cc245bad19722952bb163d1 +size 1753937 diff --git a/3.processing_features/data/bulk_profiles/Plate_1_bulk_feature_selected.parquet b/3.processing_features/data/bulk_profiles/Plate_1_bulk_feature_selected.parquet new file mode 100644 index 0000000..de86c6d --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_1_bulk_feature_selected.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e04ae390c64bc9c1bf4b24fd00c505071212b337b16b6f6a8095b670e61db7b1 +size 338151 diff --git a/3.processing_features/data/bulk_profiles/Plate_1_bulk_normalized.parquet b/3.processing_features/data/bulk_profiles/Plate_1_bulk_normalized.parquet new file mode 100644 index 0000000..8268d91 --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_1_bulk_normalized.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b286b011d1b3fbbdfd32efed42ee314f5a4e18cb82a76eb17c027047980a7ec +size 1755460 diff --git a/3.processing_features/data/bulk_profiles/Plate_2_bulk.parquet b/3.processing_features/data/bulk_profiles/Plate_2_bulk.parquet new file mode 100644 index 0000000..76f92c0 --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_2_bulk.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faddab8dd35ac30f9fdd4c6ae9b5e5baceb480d813202dbaa9adbf2660c5fb0f +size 2087172 diff --git a/3.processing_features/data/bulk_profiles/Plate_2_bulk_annotated.parquet b/3.processing_features/data/bulk_profiles/Plate_2_bulk_annotated.parquet new file mode 100644 index 0000000..30a545e --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_2_bulk_annotated.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46bc3bc677b389523fd4c6bbf2177ac298a69b53244c06467df39ef7be1e9d1e +size 2089805 diff --git a/3.processing_features/data/bulk_profiles/Plate_2_bulk_feature_selected.parquet b/3.processing_features/data/bulk_profiles/Plate_2_bulk_feature_selected.parquet new file mode 100644 index 0000000..369dc5a --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_2_bulk_feature_selected.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:679beedbedb7aa010dc91e02d7f16c77a904be3ab55b72b68f3b05e1aee98ae5 +size 1065741 diff --git a/3.processing_features/data/bulk_profiles/Plate_2_bulk_normalized.parquet b/3.processing_features/data/bulk_profiles/Plate_2_bulk_normalized.parquet new file mode 100644 index 0000000..5650b12 --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_2_bulk_normalized.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:368400cbc62f04741f078566421713d0149a9231e85a1ceec1f93fdc86ae8611 +size 2097440 diff --git a/3.processing_features/data/bulk_profiles/Plate_3_bulk.parquet b/3.processing_features/data/bulk_profiles/Plate_3_bulk.parquet new file mode 100644 index 0000000..922caea --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_3_bulk.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6caf95cf379f63827cc45a7929cd3137940e62af61614c61188eb278ff41427b +size 3578872 diff --git a/3.processing_features/data/bulk_profiles/Plate_3_bulk_annotated.parquet b/3.processing_features/data/bulk_profiles/Plate_3_bulk_annotated.parquet new file mode 100644 index 0000000..8d3ecbe --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_3_bulk_annotated.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3942d36d4f3c06efb572ecc8e3acfdff60616cfc4184097da4396519321d1019 +size 3582148 diff --git a/3.processing_features/data/bulk_profiles/Plate_3_bulk_feature_selected.parquet b/3.processing_features/data/bulk_profiles/Plate_3_bulk_feature_selected.parquet new file mode 100644 index 0000000..effb4f5 --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_3_bulk_feature_selected.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f32ad6d23fca5b5ae84c73ed9276119c4a33c157a9f608f3d40ba0b310d08145 +size 1227007 diff --git a/3.processing_features/data/bulk_profiles/Plate_3_bulk_normalized.parquet b/3.processing_features/data/bulk_profiles/Plate_3_bulk_normalized.parquet new file mode 100644 index 0000000..a0e031a --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_3_bulk_normalized.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:801e3d7949a608d28852176b093093892ace66a60ca00759d4cda0238415d414 +size 3604212 diff --git a/3.processing_features/data/bulk_profiles/Plate_3_prime_bulk.parquet b/3.processing_features/data/bulk_profiles/Plate_3_prime_bulk.parquet new file mode 100644 index 0000000..713ac99 --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_3_prime_bulk.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:778ab5fdb13a0ff44a584ded384bffbf38db940336b70d418db93a957969e0d2 +size 3576497 diff --git a/3.processing_features/data/bulk_profiles/Plate_3_prime_bulk_annotated.parquet b/3.processing_features/data/bulk_profiles/Plate_3_prime_bulk_annotated.parquet new file mode 100644 index 0000000..6cd343b --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_3_prime_bulk_annotated.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d586b56ba62249220d4d0f2dac9800fe3e4713c3332d898c7170a3b372e9e5d +size 3579582 diff --git a/3.processing_features/data/bulk_profiles/Plate_3_prime_bulk_feature_selected.parquet b/3.processing_features/data/bulk_profiles/Plate_3_prime_bulk_feature_selected.parquet new file mode 100644 index 0000000..5e3ba68 --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_3_prime_bulk_feature_selected.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61d01673a2c27d5c92fe701576bcde251b483c1c35d826359b601eb6cca9de62 +size 1468900 diff --git a/3.processing_features/data/bulk_profiles/Plate_3_prime_bulk_normalized.parquet b/3.processing_features/data/bulk_profiles/Plate_3_prime_bulk_normalized.parquet new file mode 100644 index 0000000..b1516fb --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_3_prime_bulk_normalized.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd0747acaabab106e80bfe466174808a33bf892f71421a9782f4ca6152cc3c1f +size 3601581 diff --git a/3.processing_features/data/bulk_profiles/Plate_4_bulk.parquet b/3.processing_features/data/bulk_profiles/Plate_4_bulk.parquet new file mode 100644 index 0000000..b109152 --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_4_bulk.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11cfe27774697138fa44b431a848d3f69f70a156f64044e24bcb36301012d6dd +size 3315389 diff --git a/3.processing_features/data/bulk_profiles/Plate_4_bulk_annotated.parquet b/3.processing_features/data/bulk_profiles/Plate_4_bulk_annotated.parquet new file mode 100644 index 0000000..6b57a4f --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_4_bulk_annotated.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a7a70cf92aa8b51be858fc1e87c59883d2a9387b9fbbd92e207c1c5f4d4efd9 +size 3321007 diff --git a/3.processing_features/data/bulk_profiles/Plate_4_bulk_feature_selected.parquet b/3.processing_features/data/bulk_profiles/Plate_4_bulk_feature_selected.parquet new file mode 100644 index 0000000..2b7f04d --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_4_bulk_feature_selected.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a07ec663e9408afbd8b1ad5c142091c4b7e0b686230d7239f59c301a87061df1 +size 1727546 diff --git a/3.processing_features/data/bulk_profiles/Plate_4_bulk_normalized.parquet b/3.processing_features/data/bulk_profiles/Plate_4_bulk_normalized.parquet new file mode 100644 index 0000000..01cd60d --- /dev/null +++ b/3.processing_features/data/bulk_profiles/Plate_4_bulk_normalized.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccbb7c55859d3256ef7880517856c830fb457aa4550f62e3ebdfd71504070f54 +size 3340463 diff --git a/3.processing_features/data/converted_data/Plate_1.parquet b/3.processing_features/data/converted_data/Plate_1.parquet index d69a136..adb32ff 100644 --- a/3.processing_features/data/converted_data/Plate_1.parquet +++ b/3.processing_features/data/converted_data/Plate_1.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:251475ce5eceab969da9a27621dd43ab40fc215d9624c68da00d2e77203935f1 +oid sha256:a8e2ca1624a819a429fc74d521c70cff8ee7b40e407e4d7b81974c42809c4c64 size 5120764 diff --git a/3.processing_features/data/converted_data/Plate_2.parquet b/3.processing_features/data/converted_data/Plate_2.parquet index ad297ec..c30950e 100644 --- a/3.processing_features/data/converted_data/Plate_2.parquet +++ b/3.processing_features/data/converted_data/Plate_2.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d6591514f420beea086340b4f31f755f9cecddf5ea2bbb8c0fc21e816081ebf +oid sha256:9a06c6cf7de7b96e6a69985266f18e8c5a0518667d1ddab9f10fd1dbc2da3aff size 26812914 diff --git a/3.processing_features/data/converted_data/Plate_3.parquet b/3.processing_features/data/converted_data/Plate_3.parquet index 16e1607..036b81f 100644 --- a/3.processing_features/data/converted_data/Plate_3.parquet +++ b/3.processing_features/data/converted_data/Plate_3.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abfb9422e59c131ba9cf70d42e9477bfa9742d79cf557884aabc102c2d5b97f1 +oid sha256:f14cbe1b8a980d91f808ef6e608e54533d3c8515eb10c22a6a8a503c64f459a9 size 379241531 diff --git a/3.processing_features/data/converted_data/Plate_3_prime.parquet b/3.processing_features/data/converted_data/Plate_3_prime.parquet index 64d6cb8..ce7d337 100644 --- a/3.processing_features/data/converted_data/Plate_3_prime.parquet +++ b/3.processing_features/data/converted_data/Plate_3_prime.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e0fdfc22075b85d7dbf634cf272268d9e5478471dd62a0a4fdef6c2e4847643 +oid sha256:2c9f27aa6b5f872af4f75ad2ee3340e4c97850887388163ff7af067139d4bb38 size 294422146 diff --git a/3.processing_features/data/converted_data/Plate_4.parquet b/3.processing_features/data/converted_data/Plate_4.parquet index 14e8d56..45355e6 100644 --- a/3.processing_features/data/converted_data/Plate_4.parquet +++ b/3.processing_features/data/converted_data/Plate_4.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5dc674b3da606f74e306f21944ecadc10090951ec1266ee3220a73b44c81465 +oid sha256:facdc731d5b122f163d08af2cf47a5c1ed9a6edcd2c9c905aae59d6ab88a7df2 size 151568976 diff --git a/3.processing_features/data/single_cell_profiles/Plate_1_bulk_camerons_method.parquet b/3.processing_features/data/single_cell_profiles/Plate_1_bulk_camerons_method.parquet new file mode 100644 index 0000000..61437cc --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_1_bulk_camerons_method.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8a2aa8d79fcf4d9ceb8383e6b70836c82225084144e832b3707c0a8403519c3 +size 863744 diff --git a/3.processing_features/data/single_cell_profiles/Plate_1_sc_annotated.parquet b/3.processing_features/data/single_cell_profiles/Plate_1_sc_annotated.parquet new file mode 100644 index 0000000..e96b595 --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_1_sc_annotated.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67748d761a1743eae0f20d943ca92e6d644bda6ffe7ef36a9c2b8d2c098e4c26 +size 5123338 diff --git a/3.processing_features/data/single_cell_profiles/Plate_1_sc_feature_selected.parquet b/3.processing_features/data/single_cell_profiles/Plate_1_sc_feature_selected.parquet new file mode 100644 index 0000000..6a5947c --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_1_sc_feature_selected.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35ca94d9b86bea698ede7005acaa6d9e02c42411bcff3fdba030e9b0036afe1d +size 2625760 diff --git a/3.processing_features/data/single_cell_profiles/Plate_1_sc_normalized.parquet b/3.processing_features/data/single_cell_profiles/Plate_1_sc_normalized.parquet new file mode 100644 index 0000000..c077a7a --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_1_sc_normalized.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b87ff0b549b3b4dc398fbc6615f4abf90f7920f33b1a5e81b4ccbb4963b5422 +size 5186667 diff --git a/3.processing_features/data/single_cell_profiles/Plate_2_bulk_camerons_method.parquet b/3.processing_features/data/single_cell_profiles/Plate_2_bulk_camerons_method.parquet new file mode 100644 index 0000000..e6db1d1 --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_2_bulk_camerons_method.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90e066b41c09358f4ab6b2e3b08eb9aeeec13d06f9a891d2fddb07fa26a85316 +size 1053693 diff --git a/3.processing_features/data/single_cell_profiles/Plate_2_sc_annotated.parquet b/3.processing_features/data/single_cell_profiles/Plate_2_sc_annotated.parquet new file mode 100644 index 0000000..e777f9b --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_2_sc_annotated.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19f0dc5b68ec6574080854033c602c60b2bec758b83dd974be7105b16fd871af +size 26815268 diff --git a/3.processing_features/data/single_cell_profiles/Plate_2_sc_feature_selected.parquet b/3.processing_features/data/single_cell_profiles/Plate_2_sc_feature_selected.parquet new file mode 100644 index 0000000..3eee9bd --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_2_sc_feature_selected.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a6258c2546d59a5adc647473f2629299e9b2f1b406da3a6c9edb8b329ab92db +size 14213074 diff --git a/3.processing_features/data/single_cell_profiles/Plate_2_sc_normalized.parquet b/3.processing_features/data/single_cell_profiles/Plate_2_sc_normalized.parquet new file mode 100644 index 0000000..adc352d --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_2_sc_normalized.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ce6fb4085f6987ee529a9a3954c767fce18f48e8dc98287769a9898c77389a3 +size 27177722 diff --git a/3.processing_features/data/single_cell_profiles/Plate_3_bulk_camerons_method.parquet b/3.processing_features/data/single_cell_profiles/Plate_3_bulk_camerons_method.parquet new file mode 100644 index 0000000..cea260b --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_3_bulk_camerons_method.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:091b5eab1754db5fc006e3fe7efd9ba2809c39613f86f46c5eed971081e95d09 +size 1869838 diff --git a/3.processing_features/data/single_cell_profiles/Plate_3_prime_bulk_camerons_method.parquet b/3.processing_features/data/single_cell_profiles/Plate_3_prime_bulk_camerons_method.parquet new file mode 100644 index 0000000..9684ea9 --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_3_prime_bulk_camerons_method.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f35d3153032341f8b62054bf1ba05c9aba230329d9bede8c48fdcba623e4bf64 +size 1823967 diff --git a/3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_annotated.parquet b/3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_annotated.parquet new file mode 100644 index 0000000..29d5ca4 --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_annotated.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50f34782dbd0b27ee0068e6a0a2ee4fbfebee6d3858f2b405d5adcb8088efa6a +size 294391555 diff --git a/3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_feature_selected.parquet b/3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_feature_selected.parquet new file mode 100644 index 0000000..f01e219 --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_feature_selected.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1b06210116f15a93ab2680c3e34fb2b7936c82fd960122864fe17c74f046e70 +size 157247682 diff --git a/3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_normalized.parquet b/3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_normalized.parquet new file mode 100644 index 0000000..72a54b7 --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_3_prime_sc_normalized.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1614526183055559b0516c59fae3666e5088757bd3150a420f609be550c5a6bd +size 296819114 diff --git a/3.processing_features/data/single_cell_profiles/Plate_3_sc_annotated.parquet b/3.processing_features/data/single_cell_profiles/Plate_3_sc_annotated.parquet new file mode 100644 index 0000000..9c75179 --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_3_sc_annotated.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53f720d221592ae73c85ddf5fc6bddec8d950f4f0173050b57ac0c0f04421c93 +size 379244379 diff --git a/3.processing_features/data/single_cell_profiles/Plate_3_sc_feature_selected.parquet b/3.processing_features/data/single_cell_profiles/Plate_3_sc_feature_selected.parquet new file mode 100644 index 0000000..d2d4650 --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_3_sc_feature_selected.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6e88f821d87f295726f4ae183b54300e3198b877c60bef5f6f195a76657b1bf +size 208135640 diff --git a/3.processing_features/data/single_cell_profiles/Plate_3_sc_normalized.parquet b/3.processing_features/data/single_cell_profiles/Plate_3_sc_normalized.parquet new file mode 100644 index 0000000..7935324 --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_3_sc_normalized.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e2d8bb7eeee4889ea341e84cb77ebf61fb3ed369dd96c3e0a8e5134eef37ed9 +size 382164469 diff --git a/3.processing_features/data/single_cell_profiles/Plate_4_bulk_camerons_method.parquet b/3.processing_features/data/single_cell_profiles/Plate_4_bulk_camerons_method.parquet new file mode 100644 index 0000000..3470974 --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_4_bulk_camerons_method.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f3506982c91db9e65fdcdfeb0887ebef711e4796922cf6ccc5927eebe0a0696 +size 1736582 diff --git a/3.processing_features/data/single_cell_profiles/Plate_4_sc_annotated.parquet b/3.processing_features/data/single_cell_profiles/Plate_4_sc_annotated.parquet new file mode 100644 index 0000000..2e28afe --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_4_sc_annotated.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4645f81355703b885bc03326057f52090b330768d53139337f2e6d0cd08ec8f +size 151581085 diff --git a/3.processing_features/data/single_cell_profiles/Plate_4_sc_feature_selected.parquet b/3.processing_features/data/single_cell_profiles/Plate_4_sc_feature_selected.parquet new file mode 100644 index 0000000..fd1cdb1 --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_4_sc_feature_selected.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6dfc3e1f4ef0296d7648de30f6c75b7a0f28d82b092f8a237e47430f7bc566f +size 83382641 diff --git a/3.processing_features/data/single_cell_profiles/Plate_4_sc_normalized.parquet b/3.processing_features/data/single_cell_profiles/Plate_4_sc_normalized.parquet new file mode 100644 index 0000000..58a5f47 --- /dev/null +++ b/3.processing_features/data/single_cell_profiles/Plate_4_sc_normalized.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b90066bf1ae7d792a8ab5d36b72a7ef6f669c2e33dfc71349b209f60066ca426 +size 153056439 From 9280b1ac33d465cd7cb9be2a23d6aff9a9d73248 Mon Sep 17 00:00:00 2001 From: gwaybio Date: Fri, 4 Aug 2023 13:13:38 -0600 Subject: [PATCH 07/10] delete other data --- .../data/feature_selected_data/Plate_1_bulk_norm_fs.parquet | 3 --- .../data/feature_selected_data/Plate_1_sc_norm_fs.parquet | 3 --- .../data/feature_selected_data/Plate_2_bulk_norm_fs.parquet | 3 --- .../data/feature_selected_data/Plate_2_sc_norm_fs.parquet | 3 --- .../data/feature_selected_data/Plate_3_bulk_norm_fs.parquet | 3 --- .../feature_selected_data/Plate_3_prime_bulk_norm_fs.parquet | 3 --- .../feature_selected_data/Plate_3_prime_sc_norm_fs.parquet | 3 --- .../data/feature_selected_data/Plate_3_sc_norm_fs.parquet | 3 --- .../data/feature_selected_data/Plate_4_bulk_norm_fs.parquet | 3 --- .../data/feature_selected_data/Plate_4_sc_norm_fs.parquet | 3 --- .../data/normalized_data/Plate_1_bulk_norm.parquet | 3 --- .../data/normalized_data/Plate_1_sc_norm.parquet | 3 --- .../data/normalized_data/Plate_2_bulk_norm.parquet | 3 --- .../data/normalized_data/Plate_2_sc_norm.parquet | 3 --- .../data/normalized_data/Plate_3_bulk_norm.parquet | 3 --- .../data/normalized_data/Plate_3_prime_bulk_norm.parquet | 3 --- .../data/normalized_data/Plate_3_prime_sc_norm.parquet | 3 --- .../data/normalized_data/Plate_3_sc_norm.parquet | 3 --- .../data/normalized_data/Plate_4_bulk_norm.parquet | 3 --- .../data/normalized_data/Plate_4_sc_norm.parquet | 3 --- 20 files changed, 60 deletions(-) delete mode 100644 3.processing_features/data/feature_selected_data/Plate_1_bulk_norm_fs.parquet delete mode 100644 3.processing_features/data/feature_selected_data/Plate_1_sc_norm_fs.parquet delete mode 100644 3.processing_features/data/feature_selected_data/Plate_2_bulk_norm_fs.parquet delete mode 100644 3.processing_features/data/feature_selected_data/Plate_2_sc_norm_fs.parquet delete mode 100644 3.processing_features/data/feature_selected_data/Plate_3_bulk_norm_fs.parquet delete mode 100644 3.processing_features/data/feature_selected_data/Plate_3_prime_bulk_norm_fs.parquet delete mode 100644 3.processing_features/data/feature_selected_data/Plate_3_prime_sc_norm_fs.parquet delete mode 100644 3.processing_features/data/feature_selected_data/Plate_3_sc_norm_fs.parquet delete mode 100644 3.processing_features/data/feature_selected_data/Plate_4_bulk_norm_fs.parquet delete mode 100644 3.processing_features/data/feature_selected_data/Plate_4_sc_norm_fs.parquet delete mode 100644 3.processing_features/data/normalized_data/Plate_1_bulk_norm.parquet delete mode 100644 3.processing_features/data/normalized_data/Plate_1_sc_norm.parquet delete mode 100644 3.processing_features/data/normalized_data/Plate_2_bulk_norm.parquet delete mode 100644 3.processing_features/data/normalized_data/Plate_2_sc_norm.parquet delete mode 100644 3.processing_features/data/normalized_data/Plate_3_bulk_norm.parquet delete mode 100644 3.processing_features/data/normalized_data/Plate_3_prime_bulk_norm.parquet delete mode 100644 3.processing_features/data/normalized_data/Plate_3_prime_sc_norm.parquet delete mode 100644 3.processing_features/data/normalized_data/Plate_3_sc_norm.parquet delete mode 100644 3.processing_features/data/normalized_data/Plate_4_bulk_norm.parquet delete mode 100644 3.processing_features/data/normalized_data/Plate_4_sc_norm.parquet diff --git a/3.processing_features/data/feature_selected_data/Plate_1_bulk_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_1_bulk_norm_fs.parquet deleted file mode 100644 index 5d360a5..0000000 --- a/3.processing_features/data/feature_selected_data/Plate_1_bulk_norm_fs.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ea5b23c17031746aee15165c5cbf56eb0dd8604bb47390a11324a60f527b8246 -size 338151 diff --git a/3.processing_features/data/feature_selected_data/Plate_1_sc_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_1_sc_norm_fs.parquet deleted file mode 100644 index 5f06654..0000000 --- a/3.processing_features/data/feature_selected_data/Plate_1_sc_norm_fs.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7358718240776075d2220a6f645dd24e2d98e72a76c65dcf5ec1d142050813a9 -size 2620756 diff --git a/3.processing_features/data/feature_selected_data/Plate_2_bulk_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_2_bulk_norm_fs.parquet deleted file mode 100644 index e528bfa..0000000 --- a/3.processing_features/data/feature_selected_data/Plate_2_bulk_norm_fs.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d352aa69c2c5d7fa430b5fc66ecaebeb122b6e4c44f79e56f9a50327e6480c27 -size 1051248 diff --git a/3.processing_features/data/feature_selected_data/Plate_2_sc_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_2_sc_norm_fs.parquet deleted file mode 100644 index c41dfa8..0000000 --- a/3.processing_features/data/feature_selected_data/Plate_2_sc_norm_fs.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:71e3670b329e5b37449f7e128271eb31ad24986ad618807edcedabc97183df42 -size 14218043 diff --git a/3.processing_features/data/feature_selected_data/Plate_3_bulk_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_3_bulk_norm_fs.parquet deleted file mode 100644 index 6ecbe77..0000000 --- a/3.processing_features/data/feature_selected_data/Plate_3_bulk_norm_fs.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c0cbc7ac2114e21e6a20d6d39d147e60f574d7add6c2a5f28878cc5d685ca0d8 -size 1210260 diff --git a/3.processing_features/data/feature_selected_data/Plate_3_prime_bulk_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_3_prime_bulk_norm_fs.parquet deleted file mode 100644 index 2b3d6b3..0000000 --- a/3.processing_features/data/feature_selected_data/Plate_3_prime_bulk_norm_fs.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d5b055d96b0a222bdf71dfb1f422fc36e8217ecb036d541d67bd882fd27daa5f -size 1453252 diff --git a/3.processing_features/data/feature_selected_data/Plate_3_prime_sc_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_3_prime_sc_norm_fs.parquet deleted file mode 100644 index 0454060..0000000 --- a/3.processing_features/data/feature_selected_data/Plate_3_prime_sc_norm_fs.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c5f5c5642812769a61407f409c1ca38d4e2b6eb95992394811f58bbaa0b2dad3 -size 157740866 diff --git a/3.processing_features/data/feature_selected_data/Plate_3_sc_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_3_sc_norm_fs.parquet deleted file mode 100644 index c2e608f..0000000 --- a/3.processing_features/data/feature_selected_data/Plate_3_sc_norm_fs.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:225302e2ef66a42d48e72dcdf52f2b1353ba8c6deb75fa05aff220d8138cd8c7 -size 208615874 diff --git a/3.processing_features/data/feature_selected_data/Plate_4_bulk_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_4_bulk_norm_fs.parquet deleted file mode 100644 index 5f19702..0000000 --- a/3.processing_features/data/feature_selected_data/Plate_4_bulk_norm_fs.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ed0849551b2f42524610523c24a9b3a6e6522c683a6ca9151aa76caf6feed1b4 -size 1716574 diff --git a/3.processing_features/data/feature_selected_data/Plate_4_sc_norm_fs.parquet b/3.processing_features/data/feature_selected_data/Plate_4_sc_norm_fs.parquet deleted file mode 100644 index 7cd2ead..0000000 --- a/3.processing_features/data/feature_selected_data/Plate_4_sc_norm_fs.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:438aa8c53b98e0dbced08dc6461481353dc0cbaa12c361c8f39e349eed7694c4 -size 83588263 diff --git a/3.processing_features/data/normalized_data/Plate_1_bulk_norm.parquet b/3.processing_features/data/normalized_data/Plate_1_bulk_norm.parquet deleted file mode 100644 index baa314b..0000000 --- a/3.processing_features/data/normalized_data/Plate_1_bulk_norm.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b480cff8683e17b5cf733ab51a329950b97cc4164c407652ea3de8f76685cb95 -size 1755460 diff --git a/3.processing_features/data/normalized_data/Plate_1_sc_norm.parquet b/3.processing_features/data/normalized_data/Plate_1_sc_norm.parquet deleted file mode 100644 index 716c0f1..0000000 --- a/3.processing_features/data/normalized_data/Plate_1_sc_norm.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5cf500bd1b7ac6bd907b217695fa6cb7043990b6628584dad8912f3aea40421b -size 5186667 diff --git a/3.processing_features/data/normalized_data/Plate_2_bulk_norm.parquet b/3.processing_features/data/normalized_data/Plate_2_bulk_norm.parquet deleted file mode 100644 index 88eee73..0000000 --- a/3.processing_features/data/normalized_data/Plate_2_bulk_norm.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e3e4e2ea598ae44fc66e769b768f56972576761f6be720bd832a34d2b62e6623 -size 2097440 diff --git a/3.processing_features/data/normalized_data/Plate_2_sc_norm.parquet b/3.processing_features/data/normalized_data/Plate_2_sc_norm.parquet deleted file mode 100644 index db35c4e..0000000 --- a/3.processing_features/data/normalized_data/Plate_2_sc_norm.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d6b90668f39735a4f28bb477171634252641b80fb14f600bed43e319a9c1412a -size 27177722 diff --git a/3.processing_features/data/normalized_data/Plate_3_bulk_norm.parquet b/3.processing_features/data/normalized_data/Plate_3_bulk_norm.parquet deleted file mode 100644 index 9c3f84c..0000000 --- a/3.processing_features/data/normalized_data/Plate_3_bulk_norm.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f9249b82321d296a13a7dbd4a1c1fb673e4d92b88382b939ec03a440289d332e -size 3604212 diff --git a/3.processing_features/data/normalized_data/Plate_3_prime_bulk_norm.parquet b/3.processing_features/data/normalized_data/Plate_3_prime_bulk_norm.parquet deleted file mode 100644 index 059cbfb..0000000 --- a/3.processing_features/data/normalized_data/Plate_3_prime_bulk_norm.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3c24ba0a63dabccd39d0ec8ce50976285bd7641a22db1129c7ef83ac840433ed -size 3601581 diff --git a/3.processing_features/data/normalized_data/Plate_3_prime_sc_norm.parquet b/3.processing_features/data/normalized_data/Plate_3_prime_sc_norm.parquet deleted file mode 100644 index 6527f4b..0000000 --- a/3.processing_features/data/normalized_data/Plate_3_prime_sc_norm.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3a7c192e09ccc342c08de56258470c2e310175b4bad3df0f6f579bcb49d2c156 -size 296819114 diff --git a/3.processing_features/data/normalized_data/Plate_3_sc_norm.parquet b/3.processing_features/data/normalized_data/Plate_3_sc_norm.parquet deleted file mode 100644 index f7a99b6..0000000 --- a/3.processing_features/data/normalized_data/Plate_3_sc_norm.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7ce08fb9a956e8bae1a16920b6d6bfe79da1b637db0672d0ab3e4415cb052a1 -size 382164469 diff --git a/3.processing_features/data/normalized_data/Plate_4_bulk_norm.parquet b/3.processing_features/data/normalized_data/Plate_4_bulk_norm.parquet deleted file mode 100644 index e75e075..0000000 --- a/3.processing_features/data/normalized_data/Plate_4_bulk_norm.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d6d746750bcdf1231ebc30f294062a102ce55033dca583e0261ef4b869839a0f -size 3340463 diff --git a/3.processing_features/data/normalized_data/Plate_4_sc_norm.parquet b/3.processing_features/data/normalized_data/Plate_4_sc_norm.parquet deleted file mode 100644 index 8efccd2..0000000 --- a/3.processing_features/data/normalized_data/Plate_4_sc_norm.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2f5574185e66dd1bddd3ac6a2eefcfc3f97353d9a49e2e806a04398a5f343458 -size 153056439 From 7b2e0bc525e28d57fe879a8206d1e2a130dda8be Mon Sep 17 00:00:00 2001 From: gwaybio Date: Fri, 4 Aug 2023 13:14:07 -0600 Subject: [PATCH 08/10] redo pycytominer pipelines in one notebook --- .../1.pycytominer_bulk_pipelines.ipynb | 389 + .../2.pycytominer_singlecell_pipelines.ipynb | 247 + .../scripts/1.pycytominer_bulk_pipelines.py | 134 + .../2.pycytominer_singlecell_pipelines.py | 157 + .../scripts/html/0.merge_sc_cytotable.html | 15168 ++++++++++++++++ .../html/1.pycytominer_bulk_pipelines.html | 14978 +++++++++++++++ .../2.pycytominer_singlecell_pipelines.html | 14867 +++++++++++++++ 7 files changed, 45940 insertions(+) create mode 100644 3.processing_features/1.pycytominer_bulk_pipelines.ipynb create mode 100644 3.processing_features/2.pycytominer_singlecell_pipelines.ipynb create mode 100644 3.processing_features/scripts/1.pycytominer_bulk_pipelines.py create mode 100644 3.processing_features/scripts/2.pycytominer_singlecell_pipelines.py create mode 100644 3.processing_features/scripts/html/0.merge_sc_cytotable.html create mode 100644 3.processing_features/scripts/html/1.pycytominer_bulk_pipelines.html create mode 100644 3.processing_features/scripts/html/2.pycytominer_singlecell_pipelines.html diff --git a/3.processing_features/1.pycytominer_bulk_pipelines.ipynb b/3.processing_features/1.pycytominer_bulk_pipelines.ipynb new file mode 100644 index 0000000..91c67c3 --- /dev/null +++ b/3.processing_features/1.pycytominer_bulk_pipelines.ipynb @@ -0,0 +1,389 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "91a846b5-8ead-4ad8-b291-00c09a09ab0a", + "metadata": {}, + "source": [ + "## Perform traditional bulk pycytominer pipeline\n", + "\n", + "Following single-cell curation with cytotable, we create bulk profiles by applying the following steps:\n", + "\n", + "1. aggregation\n", + "2. annotation\n", + "3. normalization\n", + "4. feature_selection" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4b02e669-6b8b-49d7-a0a0-678eff76a854", + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib\n", + "import yaml\n", + "import pprint\n", + "\n", + "import pandas as pd\n", + "\n", + "from pycytominer import aggregate, annotate, normalize, feature_select\n", + "from pycytominer.cyto_utils import load_profiles" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "609183fc-b780-4731-9a63-cbfb448d4f66", + "metadata": {}, + "outputs": [], + "source": [ + "# Set constants\n", + "feature_select_ops = [\n", + " \"variance_threshold\",\n", + " \"correlation_threshold\",\n", + " \"blocklist\",\n", + "]\n", + "\n", + "# Set paths\n", + "output_dir = pathlib.Path(\"data/bulk_profiles\")\n", + "output_dir.mkdir(exist_ok=True)\n", + "metadata_dir = pathlib.Path(\"../0.download_data/metadata/\")\n", + "\n", + "# load in plate information\n", + "dictionary_path = pathlib.Path(\"./plate_info_dictionary.yaml\")\n", + "with open(dictionary_path) as file:\n", + " plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "88811904-baa5-47a4-846b-d733daef1952", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{ 'Plate_1': { 'dest_path': 'data/converted_data/Plate_1.parquet',\n", + " 'platemap_path': '/home/gway/repos/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate1.csv',\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},\n", + " 'Plate_2': { 'dest_path': 'data/converted_data/Plate_2.parquet',\n", + " 'platemap_path': '/home/gway/repos/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate2.csv',\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},\n", + " 'Plate_3': { 'dest_path': 'data/converted_data/Plate_3.parquet',\n", + " 'platemap_path': '/home/gway/repos/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},\n", + " 'Plate_3_prime': { 'dest_path': 'data/converted_data/Plate_3_prime.parquet',\n", + " 'platemap_path': '/home/gway/repos/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},\n", + " 'Plate_4': { 'dest_path': 'data/converted_data/Plate_4.parquet',\n", + " 'platemap_path': '/home/gway/repos/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate4.csv',\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite'}}\n" + ] + } + ], + "source": [ + "# add path to platemaps for each plate \n", + "for plate in plate_info_dictionary.keys():\n", + " # since Plate_3_prime has the same platemap as Plate_3,\n", + " # we need an else statement so that we make sure it adds the \n", + " # path that was given to Plate_3\n", + " if plate != \"Plate_3_prime\":\n", + " # match the naming format of the plates to the platemap file\n", + " plate_info_dictionary[plate][\"platemap_path\"] = str(\n", + " pathlib.Path(\n", + " list(\n", + " metadata_dir.rglob(f\"platemap_NF1_{plate.replace('_', '').lower()}.csv\")\n", + " )[0]\n", + " ).resolve(strict=True)\n", + " )\n", + " else:\n", + " plate_info_dictionary[\"Plate_3_prime\"][\"platemap_path\"] = (\n", + " plate_info_dictionary[\"Plate_3\"][\"platemap_path\"]\n", + " )\n", + "\n", + "# view the dictionary to assess that all info is added correctly\n", + "pprint.pprint(plate_info_dictionary, indent=4)" + ] + }, + { + "cell_type": "markdown", + "id": "9817b380-8fd5-464e-84d9-29505ae7fd41", + "metadata": {}, + "source": [ + "## Perform pycytominer pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "cc03cbc9-3c0d-4251-a66c-a82a2f10c166", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Now performing pycytominer pipeline for Plate_1\n", + "Now performing pycytominer pipeline for Plate_2\n", + "Now performing pycytominer pipeline for Plate_3\n", + "Now performing pycytominer pipeline for Plate_3_prime\n", + "Now performing pycytominer pipeline for Plate_4\n" + ] + } + ], + "source": [ + "for plate, info in plate_info_dictionary.items():\n", + " print(f\"Now performing pycytominer pipeline for {plate}\")\n", + " output_aggregated_file = str(pathlib.Path(f\"{output_dir}/{plate}_bulk.parquet\"))\n", + " output_annotated_file = str(pathlib.Path(f\"{output_dir}/{plate}_bulk_annotated.parquet\"))\n", + " output_normalized_file = str(pathlib.Path(f\"{output_dir}/{plate}_bulk_normalized.parquet\"))\n", + " output_feature_select_file = str(pathlib.Path(f\"{output_dir}/{plate}_bulk_feature_selected.parquet\"))\n", + " \n", + " # Load single-cell profiles\n", + " single_cell_df = pd.read_parquet(info[\"dest_path\"])\n", + " \n", + " # Load platemap\n", + " platemap_df = pd.read_csv(info[\"platemap_path\"])\n", + "\n", + " # Step 1: Aggregation\n", + " aggregate(\n", + " population_df=single_cell_df,\n", + " operation=\"median\",\n", + " strata=[\"Image_Metadata_Plate\", \"Image_Metadata_Well\"],\n", + " output_file=output_aggregated_file,\n", + " output_type=\"parquet\"\n", + " )\n", + " \n", + " # Step 2: Annotation\n", + " annotate(\n", + " profiles=output_aggregated_file,\n", + " platemap=platemap_df,\n", + " join_on=[\"Metadata_well_position\", \"Image_Metadata_Well\"],\n", + " output_file=output_annotated_file,\n", + " output_type=\"parquet\",\n", + " )\n", + " \n", + " # Step 3: Normalization\n", + " normalized_df = normalize(\n", + " profiles=output_annotated_file,\n", + " method=\"standardize\",\n", + " output_file=output_normalized_file,\n", + " output_type=\"parquet\",\n", + " )\n", + " \n", + " # Step 4: Feature selection\n", + " feature_select(\n", + " output_normalized_file,\n", + " operation=feature_select_ops,\n", + " output_file=output_feature_select_file,\n", + " output_type=\"parquet\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4979678b-d0a0-40f8-b3e3-3ec5734ad8a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(60, 1174)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Metadata_WellRowMetadata_WellColMetadata_gene_nameMetadata_genotypeMetadata_seed_densityMetadata_siRNAMetadata_RNAiMaxMetadata_ConcentrationMetadata_PlateMetadata_Well...Nuclei_Texture_InverseDifferenceMoment_DAPI_3_00_256Nuclei_Texture_InverseDifferenceMoment_DAPI_3_01_256Nuclei_Texture_InverseDifferenceMoment_DAPI_3_02_256Nuclei_Texture_InverseDifferenceMoment_GFP_3_02_256Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256Nuclei_Texture_SumEntropy_DAPI_3_02_256Nuclei_Texture_SumVariance_DAPI_3_01_256Nuclei_Texture_Variance_RFP_3_01_256
0B2NF1WT1000None00.00Plate_4B2...0.041461-0.481716-0.590904-0.538406-1.757806-2.256964-2.035733-1.120269-0.0978432.515517
1B3NF1WT1000Scramble10.05Plate_4B3...-0.939900-1.382558-0.9312050.7185460.022127-0.971826-0.2853591.4787160.3875650.906714
\n", + "

2 rows × 1174 columns

\n", + "
" + ], + "text/plain": [ + " Metadata_WellRow Metadata_WellCol Metadata_gene_name Metadata_genotype \\\n", + "0 B 2 NF1 WT \n", + "1 B 3 NF1 WT \n", + "\n", + " Metadata_seed_density Metadata_siRNA Metadata_RNAiMax \\\n", + "0 1000 None 0 \n", + "1 1000 Scramble 1 \n", + "\n", + " Metadata_Concentration Metadata_Plate Metadata_Well ... \\\n", + "0 0.00 Plate_4 B2 ... \n", + "1 0.05 Plate_4 B3 ... \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_DAPI_3_00_256 \\\n", + "0 0.041461 \n", + "1 -0.939900 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_DAPI_3_01_256 \\\n", + "0 -0.481716 \n", + "1 -1.382558 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_DAPI_3_02_256 \\\n", + "0 -0.590904 \n", + "1 -0.931205 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_GFP_3_02_256 \\\n", + "0 -0.538406 \n", + "1 0.718546 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256 \\\n", + "0 -1.757806 \n", + "1 0.022127 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256 \\\n", + "0 -2.256964 \n", + "1 -0.971826 \n", + "\n", + " Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256 \\\n", + "0 -2.035733 \n", + "1 -0.285359 \n", + "\n", + " Nuclei_Texture_SumEntropy_DAPI_3_02_256 \\\n", + "0 -1.120269 \n", + "1 1.478716 \n", + "\n", + " Nuclei_Texture_SumVariance_DAPI_3_01_256 \\\n", + "0 -0.097843 \n", + "1 0.387565 \n", + "\n", + " Nuclei_Texture_Variance_RFP_3_01_256 \n", + "0 2.515517 \n", + "1 0.906714 \n", + "\n", + "[2 rows x 1174 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check output file\n", + "test_df = load_profiles(output_feature_select_file)\n", + "\n", + "print(test_df.shape)\n", + "test_df.head(2)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/3.processing_features/2.pycytominer_singlecell_pipelines.ipynb b/3.processing_features/2.pycytominer_singlecell_pipelines.ipynb new file mode 100644 index 0000000..e20489f --- /dev/null +++ b/3.processing_features/2.pycytominer_singlecell_pipelines.ipynb @@ -0,0 +1,247 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "91a846b5-8ead-4ad8-b291-00c09a09ab0a", + "metadata": {}, + "source": [ + "## Perform single-cell pycytominer pipelines\n", + "\n", + "Following single-cell curation with cytotable, we create single-cell profiles by applying the following steps:\n", + "\n", + "1. annotation\n", + "2. normalization\n", + "3. feature_selection\n", + "\n", + "Additionally, we create bulk profiles following feature selection.\n", + "We call this \"Cameron's Method\".\n", + "\n", + "4. Aggregate (to form bulk, after single-cell processing)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4b02e669-6b8b-49d7-a0a0-678eff76a854", + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib\n", + "import yaml\n", + "import pprint\n", + "\n", + "import pandas as pd\n", + "\n", + "from pycytominer import aggregate, annotate, normalize, feature_select\n", + "from pycytominer.cyto_utils import load_profiles, output, infer_cp_features" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "609183fc-b780-4731-9a63-cbfb448d4f66", + "metadata": {}, + "outputs": [], + "source": [ + "# Set constants\n", + "feature_select_ops = [\n", + " \"variance_threshold\",\n", + " \"correlation_threshold\",\n", + " \"blocklist\",\n", + "]\n", + "\n", + "# Columns to remove prior to single-cell aggregation via cameron's method\n", + "cameron_unwanted_aggregate_cols = {\"Object\", \"Parent\", \"Site\", \"Image\"}\n", + "\n", + "# Set paths\n", + "output_dir = pathlib.Path(\"data/single_cell_profiles\")\n", + "output_dir.mkdir(exist_ok=True)\n", + "metadata_dir = pathlib.Path(\"../0.download_data/metadata/\")\n", + "\n", + "# load in plate information\n", + "dictionary_path = pathlib.Path(\"./plate_info_dictionary.yaml\")\n", + "with open(dictionary_path) as file:\n", + " plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "88811904-baa5-47a4-846b-d733daef1952", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{ 'Plate_1': { 'dest_path': 'data/converted_data/Plate_1.parquet',\n", + " 'platemap_path': '/home/gway/repos/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate1.csv',\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite'},\n", + " 'Plate_2': { 'dest_path': 'data/converted_data/Plate_2.parquet',\n", + " 'platemap_path': '/home/gway/repos/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate2.csv',\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite'},\n", + " 'Plate_3': { 'dest_path': 'data/converted_data/Plate_3.parquet',\n", + " 'platemap_path': '/home/gway/repos/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},\n", + " 'Plate_3_prime': { 'dest_path': 'data/converted_data/Plate_3_prime.parquet',\n", + " 'platemap_path': '/home/gway/repos/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv',\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},\n", + " 'Plate_4': { 'dest_path': 'data/converted_data/Plate_4.parquet',\n", + " 'platemap_path': '/home/gway/repos/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate4.csv',\n", + " 'source_path': '/home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite'}}\n" + ] + } + ], + "source": [ + "# add path to platemaps for each plate \n", + "for plate in plate_info_dictionary.keys():\n", + " # since Plate_3_prime has the same platemap as Plate_3,\n", + " # we need an else statement so that we make sure it adds the \n", + " # path that was given to Plate_3\n", + " if plate != \"Plate_3_prime\":\n", + " # match the naming format of the plates to the platemap file\n", + " plate_info_dictionary[plate][\"platemap_path\"] = str(\n", + " pathlib.Path(\n", + " list(\n", + " metadata_dir.rglob(f\"platemap_NF1_{plate.replace('_', '').lower()}.csv\")\n", + " )[0]\n", + " ).resolve(strict=True)\n", + " )\n", + " else:\n", + " plate_info_dictionary[\"Plate_3_prime\"][\"platemap_path\"] = (\n", + " plate_info_dictionary[\"Plate_3\"][\"platemap_path\"]\n", + " )\n", + "\n", + "# view the dictionary to assess that all info is added correctly\n", + "pprint.pprint(plate_info_dictionary, indent=4)" + ] + }, + { + "cell_type": "markdown", + "id": "9817b380-8fd5-464e-84d9-29505ae7fd41", + "metadata": {}, + "source": [ + "## Perform single-cell pycytominer pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "cc03cbc9-3c0d-4251-a66c-a82a2f10c166", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Now performing single-cell pycytominer pipeline for Plate_1\n", + "(8, 843)\n", + "Now performing single-cell pycytominer pipeline for Plate_2\n", + "(32, 850)\n", + "Now performing single-cell pycytominer pipeline for Plate_3\n", + "(72, 1160)\n", + "Now performing single-cell pycytominer pipeline for Plate_3_prime\n", + "(72, 1131)\n", + "Now performing single-cell pycytominer pipeline for Plate_4\n", + "(60, 1164)\n" + ] + } + ], + "source": [ + "for plate, info in plate_info_dictionary.items():\n", + " print(f\"Now performing single-cell pycytominer pipeline for {plate}\")\n", + " output_annotated_file = str(pathlib.Path(f\"{output_dir}/{plate}_sc_annotated.parquet\"))\n", + " output_normalized_file = str(pathlib.Path(f\"{output_dir}/{plate}_sc_normalized.parquet\"))\n", + " output_feature_select_file = str(pathlib.Path(f\"{output_dir}/{plate}_sc_feature_selected.parquet\"))\n", + " output_feature_select_file = str(pathlib.Path(f\"{output_dir}/{plate}_sc_feature_selected.parquet\"))\n", + " output_aggregated_file = str(pathlib.Path(f\"{output_dir}/{plate}_bulk_camerons_method.parquet\"))\n", + "\n", + " # Load single-cell profiles\n", + " single_cell_df = pd.read_parquet(info[\"dest_path\"])\n", + " \n", + " # Load platemap\n", + " platemap_df = pd.read_csv(info[\"platemap_path\"])\n", + " \n", + " # Step 1: Annotation\n", + " # add metadata from platemap file to extracted single cell features\n", + " annotated_df = annotate(\n", + " profiles=single_cell_df,\n", + " platemap=platemap_df,\n", + " join_on=[\"Metadata_well_position\", \"Image_Metadata_Well\"],\n", + " )\n", + "\n", + " # rename site column to avoid any issues with identifying the column as metadata over feature\n", + " annotated_df = annotated_df.rename(columns={\"Image_Metadata_Site\": \"Metadata_Site\"})\n", + "\n", + " # move metadata well, single cell count, and site to the front of the df (for easy visualization in python)\n", + " well_column = annotated_df.pop(\"Metadata_Well\")\n", + " singlecell_column = annotated_df.pop(\"Metadata_number_of_singlecells\")\n", + " site_column = annotated_df.pop(\"Metadata_Site\") \n", + "\n", + " # insert the columns in specific parts of the dataframe\n", + " annotated_df.insert(2, \"Metadata_Well\", well_column)\n", + " annotated_df.insert(3, \"Metadata_Site\", site_column)\n", + " annotated_df.insert(4, \"Metadata_number_of_singlecells\", singlecell_column)\n", + "\n", + " # save annotated df as parquet file\n", + " output(\n", + " df=annotated_df,\n", + " output_filename=output_annotated_file,\n", + " output_type=\"parquet\",\n", + " )\n", + " \n", + " # Step 2: Normalization\n", + " normalized_df = normalize(\n", + " profiles=output_annotated_file,\n", + " method=\"standardize\",\n", + " output_file=output_normalized_file,\n", + " output_type=\"parquet\",\n", + " )\n", + " \n", + " # Step 3: Feature selection\n", + " feature_select(\n", + " output_normalized_file,\n", + " operation=feature_select_ops,\n", + " output_file=output_feature_select_file,\n", + " output_type=\"parquet\"\n", + " )\n", + "\n", + " # Step 4: Cameron's method of aggregation\n", + " feature_select_df = load_profiles(output_feature_select_file)\n", + " metadata_cols = infer_cp_features(feature_select_df, metadata=True)\n", + " metadata_cols = [x for x in metadata_cols if all(col not in x for col in cameron_unwanted_aggregate_cols)]\n", + " \n", + " aggregate_df = aggregate(\n", + " population_df=feature_select_df,\n", + " operation=\"median\",\n", + " strata=metadata_cols,\n", + " output_file=output_aggregated_file,\n", + " output_type=\"parquet\"\n", + " )\n", + "\n", + " print(aggregate_df.shape)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/3.processing_features/scripts/1.pycytominer_bulk_pipelines.py b/3.processing_features/scripts/1.pycytominer_bulk_pipelines.py new file mode 100644 index 0000000..563e5b4 --- /dev/null +++ b/3.processing_features/scripts/1.pycytominer_bulk_pipelines.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +# coding: utf-8 + +# ## Perform traditional bulk pycytominer pipeline +# +# Following single-cell curation with cytotable, we create bulk profiles by applying the following steps: +# +# 1. aggregation +# 2. annotation +# 3. normalization +# 4. feature_selection + +# In[1]: + + +import pathlib +import yaml +import pprint + +import pandas as pd + +from pycytominer import aggregate, annotate, normalize, feature_select +from pycytominer.cyto_utils import load_profiles + + +# In[2]: + + +# Set constants +feature_select_ops = [ + "variance_threshold", + "correlation_threshold", + "blocklist", +] + +# Set paths +output_dir = pathlib.Path("data/bulk_profiles") +output_dir.mkdir(exist_ok=True) +metadata_dir = pathlib.Path("../0.download_data/metadata/") + +# load in plate information +dictionary_path = pathlib.Path("./plate_info_dictionary.yaml") +with open(dictionary_path) as file: + plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader) + + +# In[3]: + + +# add path to platemaps for each plate +for plate in plate_info_dictionary.keys(): + # since Plate_3_prime has the same platemap as Plate_3, + # we need an else statement so that we make sure it adds the + # path that was given to Plate_3 + if plate != "Plate_3_prime": + # match the naming format of the plates to the platemap file + plate_info_dictionary[plate]["platemap_path"] = str( + pathlib.Path( + list( + metadata_dir.rglob(f"platemap_NF1_{plate.replace('_', '').lower()}.csv") + )[0] + ).resolve(strict=True) + ) + else: + plate_info_dictionary["Plate_3_prime"]["platemap_path"] = ( + plate_info_dictionary["Plate_3"]["platemap_path"] + ) + +# view the dictionary to assess that all info is added correctly +pprint.pprint(plate_info_dictionary, indent=4) + + +# ## Perform pycytominer pipeline + +# In[4]: + + +for plate, info in plate_info_dictionary.items(): + print(f"Now performing pycytominer pipeline for {plate}") + output_aggregated_file = str(pathlib.Path(f"{output_dir}/{plate}_bulk.parquet")) + output_annotated_file = str(pathlib.Path(f"{output_dir}/{plate}_bulk_annotated.parquet")) + output_normalized_file = str(pathlib.Path(f"{output_dir}/{plate}_bulk_normalized.parquet")) + output_feature_select_file = str(pathlib.Path(f"{output_dir}/{plate}_bulk_feature_selected.parquet")) + + # Load single-cell profiles + single_cell_df = pd.read_parquet(info["dest_path"]) + + # Load platemap + platemap_df = pd.read_csv(info["platemap_path"]) + + # Step 1: Aggregation + aggregate( + population_df=single_cell_df, + operation="median", + strata=["Image_Metadata_Plate", "Image_Metadata_Well"], + output_file=output_aggregated_file, + output_type="parquet" + ) + + # Step 2: Annotation + annotate( + profiles=output_aggregated_file, + platemap=platemap_df, + join_on=["Metadata_well_position", "Image_Metadata_Well"], + output_file=output_annotated_file, + output_type="parquet", + ) + + # Step 3: Normalization + normalized_df = normalize( + profiles=output_annotated_file, + method="standardize", + output_file=output_normalized_file, + output_type="parquet", + ) + + # Step 4: Feature selection + feature_select( + output_normalized_file, + operation=feature_select_ops, + output_file=output_feature_select_file, + output_type="parquet" + ) + + +# In[5]: + + +# Check output file +test_df = load_profiles(output_feature_select_file) + +print(test_df.shape) +test_df.head(2) + diff --git a/3.processing_features/scripts/2.pycytominer_singlecell_pipelines.py b/3.processing_features/scripts/2.pycytominer_singlecell_pipelines.py new file mode 100644 index 0000000..a847796 --- /dev/null +++ b/3.processing_features/scripts/2.pycytominer_singlecell_pipelines.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python +# coding: utf-8 + +# ## Perform single-cell pycytominer pipelines +# +# Following single-cell curation with cytotable, we create single-cell profiles by applying the following steps: +# +# 1. annotation +# 2. normalization +# 3. feature_selection +# +# Additionally, we create bulk profiles following feature selection. +# We call this "Cameron's Method". +# +# 4. Aggregate (to form bulk, after single-cell processing) + +# In[1]: + + +import pathlib +import yaml +import pprint + +import pandas as pd + +from pycytominer import aggregate, annotate, normalize, feature_select +from pycytominer.cyto_utils import load_profiles, output, infer_cp_features + + +# In[2]: + + +# Set constants +feature_select_ops = [ + "variance_threshold", + "correlation_threshold", + "blocklist", +] + +# Columns to remove prior to single-cell aggregation via cameron's method +cameron_unwanted_aggregate_cols = {"Object", "Parent", "Site", "Image"} + +# Set paths +output_dir = pathlib.Path("data/single_cell_profiles") +output_dir.mkdir(exist_ok=True) +metadata_dir = pathlib.Path("../0.download_data/metadata/") + +# load in plate information +dictionary_path = pathlib.Path("./plate_info_dictionary.yaml") +with open(dictionary_path) as file: + plate_info_dictionary = yaml.load(file, Loader=yaml.FullLoader) + + +# In[3]: + + +# add path to platemaps for each plate +for plate in plate_info_dictionary.keys(): + # since Plate_3_prime has the same platemap as Plate_3, + # we need an else statement so that we make sure it adds the + # path that was given to Plate_3 + if plate != "Plate_3_prime": + # match the naming format of the plates to the platemap file + plate_info_dictionary[plate]["platemap_path"] = str( + pathlib.Path( + list( + metadata_dir.rglob(f"platemap_NF1_{plate.replace('_', '').lower()}.csv") + )[0] + ).resolve(strict=True) + ) + else: + plate_info_dictionary["Plate_3_prime"]["platemap_path"] = ( + plate_info_dictionary["Plate_3"]["platemap_path"] + ) + +# view the dictionary to assess that all info is added correctly +pprint.pprint(plate_info_dictionary, indent=4) + + +# ## Perform single-cell pycytominer pipeline + +# In[4]: + + +for plate, info in plate_info_dictionary.items(): + print(f"Now performing single-cell pycytominer pipeline for {plate}") + output_annotated_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_annotated.parquet")) + output_normalized_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_normalized.parquet")) + output_feature_select_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_feature_selected.parquet")) + output_feature_select_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_feature_selected.parquet")) + output_aggregated_file = str(pathlib.Path(f"{output_dir}/{plate}_bulk_camerons_method.parquet")) + + # Load single-cell profiles + single_cell_df = pd.read_parquet(info["dest_path"]) + + # Load platemap + platemap_df = pd.read_csv(info["platemap_path"]) + + # Step 1: Annotation + # add metadata from platemap file to extracted single cell features + annotated_df = annotate( + profiles=single_cell_df, + platemap=platemap_df, + join_on=["Metadata_well_position", "Image_Metadata_Well"], + ) + + # rename site column to avoid any issues with identifying the column as metadata over feature + annotated_df = annotated_df.rename(columns={"Image_Metadata_Site": "Metadata_Site"}) + + # move metadata well, single cell count, and site to the front of the df (for easy visualization in python) + well_column = annotated_df.pop("Metadata_Well") + singlecell_column = annotated_df.pop("Metadata_number_of_singlecells") + site_column = annotated_df.pop("Metadata_Site") + + # insert the columns in specific parts of the dataframe + annotated_df.insert(2, "Metadata_Well", well_column) + annotated_df.insert(3, "Metadata_Site", site_column) + annotated_df.insert(4, "Metadata_number_of_singlecells", singlecell_column) + + # save annotated df as parquet file + output( + df=annotated_df, + output_filename=output_annotated_file, + output_type="parquet", + ) + + # Step 2: Normalization + normalized_df = normalize( + profiles=output_annotated_file, + method="standardize", + output_file=output_normalized_file, + output_type="parquet", + ) + + # Step 3: Feature selection + feature_select( + output_normalized_file, + operation=feature_select_ops, + output_file=output_feature_select_file, + output_type="parquet" + ) + + # Step 4: Cameron's method of aggregation + feature_select_df = load_profiles(output_feature_select_file) + metadata_cols = infer_cp_features(feature_select_df, metadata=True) + metadata_cols = [x for x in metadata_cols if all(col not in x for col in cameron_unwanted_aggregate_cols)] + + aggregate_df = aggregate( + population_df=feature_select_df, + operation="median", + strata=metadata_cols, + output_file=output_aggregated_file, + output_type="parquet" + ) + + print(aggregate_df.shape) + diff --git a/3.processing_features/scripts/html/0.merge_sc_cytotable.html b/3.processing_features/scripts/html/0.merge_sc_cytotable.html new file mode 100644 index 0000000..554555f --- /dev/null +++ b/3.processing_features/scripts/html/0.merge_sc_cytotable.html @@ -0,0 +1,15168 @@ + + + + + +0.merge_sc_cytotable + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/3.processing_features/scripts/html/1.pycytominer_bulk_pipelines.html b/3.processing_features/scripts/html/1.pycytominer_bulk_pipelines.html new file mode 100644 index 0000000..d168811 --- /dev/null +++ b/3.processing_features/scripts/html/1.pycytominer_bulk_pipelines.html @@ -0,0 +1,14978 @@ + + + + + +1.pycytominer_bulk_pipelines + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/3.processing_features/scripts/html/2.pycytominer_singlecell_pipelines.html b/3.processing_features/scripts/html/2.pycytominer_singlecell_pipelines.html new file mode 100644 index 0000000..34d5d36 --- /dev/null +++ b/3.processing_features/scripts/html/2.pycytominer_singlecell_pipelines.html @@ -0,0 +1,14867 @@ + + + + + +2.pycytominer_singlecell_pipelines + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From c802d13dcc24cfe9b0960c5e8e733c105c97bab4 Mon Sep 17 00:00:00 2001 From: gwaybio Date: Fri, 4 Aug 2023 13:14:20 -0600 Subject: [PATCH 09/10] update plate info dict --- .../plate_info_dictionary.yaml | 40 +++---------------- 1 file changed, 5 insertions(+), 35 deletions(-) diff --git a/3.processing_features/plate_info_dictionary.yaml b/3.processing_features/plate_info_dictionary.yaml index bffb08a..01b8ea8 100644 --- a/3.processing_features/plate_info_dictionary.yaml +++ b/3.processing_features/plate_info_dictionary.yaml @@ -1,45 +1,15 @@ Plate_1: - annotated_path: data/annotated_data/Plate_1_sc.parquet - bulk_annotated_path: data/annotated_data/Plate_1_bulk_annotated.parquet - bulk_normalized_path: data/normalized_data/Plate_1_bulk_norm.parquet - bulk_path: data/aggregated_data/Plate_1_bulk.parquet dest_path: data/converted_data/Plate_1.parquet - normalized_path: data/normalized_data/Plate_1_sc_norm.parquet - platemap_path: /home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate1.csv - source_path: /home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite + source_path: /home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_1/Plate_1_nf1_analysis.sqlite Plate_2: - annotated_path: data/annotated_data/Plate_2_sc.parquet - bulk_annotated_path: data/annotated_data/Plate_2_bulk_annotated.parquet - bulk_normalized_path: data/normalized_data/Plate_2_bulk_norm.parquet - bulk_path: data/aggregated_data/Plate_2_bulk.parquet dest_path: data/converted_data/Plate_2.parquet - normalized_path: data/normalized_data/Plate_2_sc_norm.parquet - platemap_path: /home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate2.csv - source_path: /home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite + source_path: /home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_2/Plate_2_nf1_analysis.sqlite Plate_3: - annotated_path: data/annotated_data/Plate_3_sc.parquet - bulk_annotated_path: data/annotated_data/Plate_3_bulk_annotated.parquet - bulk_normalized_path: data/normalized_data/Plate_3_bulk_norm.parquet - bulk_path: data/aggregated_data/Plate_3_bulk.parquet dest_path: data/converted_data/Plate_3.parquet - normalized_path: data/normalized_data/Plate_3_sc_norm.parquet - platemap_path: /home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv - source_path: /home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite + source_path: /home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite Plate_3_prime: - annotated_path: data/annotated_data/Plate_3_prime_sc.parquet - bulk_annotated_path: data/annotated_data/Plate_3_prime_bulk_annotated.parquet - bulk_normalized_path: data/normalized_data/Plate_3_prime_bulk_norm.parquet - bulk_path: data/aggregated_data/Plate_3_prime_bulk.parquet dest_path: data/converted_data/Plate_3_prime.parquet - normalized_path: data/normalized_data/Plate_3_prime_sc_norm.parquet - platemap_path: /home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate3.csv - source_path: /home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite + source_path: /home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite Plate_4: - annotated_path: data/annotated_data/Plate_4_sc.parquet - bulk_annotated_path: data/annotated_data/Plate_4_bulk_annotated.parquet - bulk_normalized_path: data/normalized_data/Plate_4_bulk_norm.parquet - bulk_path: data/aggregated_data/Plate_4_bulk.parquet dest_path: data/converted_data/Plate_4.parquet - normalized_path: data/normalized_data/Plate_4_sc_norm.parquet - platemap_path: /home/jenna/nf1_cellpainting_data/0.download_data/metadata/platemap_NF1_plate4.csv - source_path: /home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite + source_path: /home/gway/repos/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite From bc9fce06f6241ddb48fdaacf1892339d3bd1f3c6 Mon Sep 17 00:00:00 2001 From: Gregory Way Date: Fri, 25 Aug 2023 13:39:14 -0600 Subject: [PATCH 10/10] address Jenna comments --- .../2.pycytominer_singlecell_pipelines.ipynb | 4 ++-- .../scripts/2.pycytominer_singlecell_pipelines.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/3.processing_features/2.pycytominer_singlecell_pipelines.ipynb b/3.processing_features/2.pycytominer_singlecell_pipelines.ipynb index e20489f..2b34254 100644 --- a/3.processing_features/2.pycytominer_singlecell_pipelines.ipynb +++ b/3.processing_features/2.pycytominer_singlecell_pipelines.ipynb @@ -153,7 +153,6 @@ " output_annotated_file = str(pathlib.Path(f\"{output_dir}/{plate}_sc_annotated.parquet\"))\n", " output_normalized_file = str(pathlib.Path(f\"{output_dir}/{plate}_sc_normalized.parquet\"))\n", " output_feature_select_file = str(pathlib.Path(f\"{output_dir}/{plate}_sc_feature_selected.parquet\"))\n", - " output_feature_select_file = str(pathlib.Path(f\"{output_dir}/{plate}_sc_feature_selected.parquet\"))\n", " output_aggregated_file = str(pathlib.Path(f\"{output_dir}/{plate}_bulk_camerons_method.parquet\"))\n", "\n", " # Load single-cell profiles\n", @@ -208,6 +207,7 @@ "\n", " # Step 4: Cameron's method of aggregation\n", " feature_select_df = load_profiles(output_feature_select_file)\n", + " # Specify metadata columns in aggregation step to ensure they are retained for downstream analysis\n", " metadata_cols = infer_cp_features(feature_select_df, metadata=True)\n", " metadata_cols = [x for x in metadata_cols if all(col not in x for col in cameron_unwanted_aggregate_cols)]\n", " \n", @@ -239,7 +239,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.15" + "version": "3.10.8" } }, "nbformat": 4, diff --git a/3.processing_features/scripts/2.pycytominer_singlecell_pipelines.py b/3.processing_features/scripts/2.pycytominer_singlecell_pipelines.py index a847796..ff1af2b 100644 --- a/3.processing_features/scripts/2.pycytominer_singlecell_pipelines.py +++ b/3.processing_features/scripts/2.pycytominer_singlecell_pipelines.py @@ -87,7 +87,6 @@ output_annotated_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_annotated.parquet")) output_normalized_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_normalized.parquet")) output_feature_select_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_feature_selected.parquet")) - output_feature_select_file = str(pathlib.Path(f"{output_dir}/{plate}_sc_feature_selected.parquet")) output_aggregated_file = str(pathlib.Path(f"{output_dir}/{plate}_bulk_camerons_method.parquet")) # Load single-cell profiles @@ -142,6 +141,7 @@ # Step 4: Cameron's method of aggregation feature_select_df = load_profiles(output_feature_select_file) + # Specify metadata columns in aggregation step to ensure they are retained for downstream analysis metadata_cols = infer_cp_features(feature_select_df, metadata=True) metadata_cols = [x for x in metadata_cols if all(col not in x for col in cameron_unwanted_aggregate_cols)]