From b1a4b0ccb84163b660564230e68fdc28f092b8c8 Mon Sep 17 00:00:00 2001 From: Jalil Nourisa Date: Tue, 26 Nov 2024 11:19:56 +0100 Subject: [PATCH] test run --- runs.ipynb | 68 +++++++++++++------ src/metrics/script_all.py | 4 +- .../multiomics/format_data/script.py | 6 +- 3 files changed, 53 insertions(+), 25 deletions(-) diff --git a/runs.ipynb b/runs.ipynb index 75855173..deae5244 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -26,11 +26,23 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "upload: resources_test/datasets_raw/op_perturbation_sc_counts.h5ad to s3://openproblems-data/resources_test/grn/datasets_raw/op_perturbation_sc_counts.h5ad\n", + "upload: resources_test/datasets_raw/op_multiome_sc_counts.h5ad to s3://openproblems-data/resources_test/grn/datasets_raw/op_multiome_sc_counts.h5ad\n", + "upload: resources_test/inference_datasets/op_rna.h5ad to s3://openproblems-data/resources_test/grn/inference_datasets/op_rna.h5ad\n", + "upload: resources_test/evaluation_datasets/op_perturbation.h5ad to s3://openproblems-data/resources_test/grn/evaluation_datasets/op_perturbation.h5ad\n", + "upload: resources_test/inference_datasets/op_atac.h5ad to s3://openproblems-data/resources_test/grn/inference_datasets/op_atac.h5ad\n" + ] + } + ], "source": [ - "# !aws s3 sync resources_test/ s3://openproblems-data/resources_test/grn/ --delete\n", + "!aws s3 sync resources_test/ s3://openproblems-data/resources_test/grn/ --delete\n", "# !aws s3 sync resources/ s3://openproblems-data/resources/grn/ --delete" ] }, @@ -43,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -81,11 +93,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 25551 × 22787\n", + " obs: 'cell_type', 'donor_id'\n", + " var: 'gene_ids', 'interval'\n", + " layers: 'X_norm', 'counts'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "ad.read('resources/')" + "ad.read('resources/inference_datasets/op_rna.h5ad')" ] }, { @@ -144,14 +170,14 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Submitted batch job 7845525\n" + "Submitted batch job 7849254\n" ] } ], @@ -195,7 +221,7 @@ } ], "source": [ - "!ls output/temp/op/" + "!ls resources/scores/op/" ] }, { @@ -578,7 +604,7 @@ } ], "source": [ - "df_scores = pd.read_csv(f\"output/temp/op/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", + "df_scores = pd.read_csv(f\"resources/scores/op/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", "# df_scores[df_scores<0] = 0\n", "df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", "df_scores['rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n", @@ -935,7 +961,7 @@ } ], "source": [ - "df_scores = pd.read_csv(f\"output/temp/op/50000-skeleton_False-binarize_True-GB.csv\", index_col=0)\n", + "df_scores = pd.read_csv(f\"resources/scores/op/50000-skeleton_False-binarize_True-GB.csv\", index_col=0)\n", "# df_scores[df_scores<0] = 0\n", "df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", "df_scores['rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n", @@ -1164,7 +1190,7 @@ } ], "source": [ - "df_scores = pd.read_csv(f\"output/temp/replogle2/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", + "df_scores = pd.read_csv(f\"resources/scores/replogle2/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", "# df_scores[df_scores<0] = 0\n", "\n", "df_scores_f = df_scores[['static-theta-0.0', 'static-theta-0.5', 'static-theta-1.0']]\n", @@ -1403,7 +1429,7 @@ } ], "source": [ - "df_scores = pd.read_csv(f\"output/temp/nakatake/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", + "df_scores = pd.read_csv(f\"resources/scores/nakatake/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", "# df_scores[df_scores<0] = 0\n", "df_scores_f = df_scores[['static-theta-0.0', 'static-theta-0.5', 'static-theta-1.0']]\n", "df_all_n = (df_scores_f-df_scores_f.min(axis=0))/(df_scores_f.max(axis=0)-df_scores_f.min(axis=0))\n", @@ -1649,7 +1675,7 @@ } ], "source": [ - "df_scores = pd.read_csv(f\"output/temp/norman/X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\", index_col=0)\n", + "df_scores = pd.read_csv(f\"resources/scores/norman/X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\", index_col=0)\n", "# df_scores[df_scores<0] = 0\n", "df_scores_f = df_scores[['static-theta-0.0', 'static-theta-0.5', 'static-theta-1.0']]\n", "df_all_n = (df_scores_f-df_scores_f.min(axis=0))/(df_scores_f.max(axis=0)-df_scores_f.min(axis=0))\n", @@ -1684,7 +1710,7 @@ } ], "source": [ - "!ls output/temp/adamson/" + "!ls resources/scores/adamson/" ] }, { @@ -1884,7 +1910,7 @@ } ], "source": [ - "df_scores = pd.read_csv(f\"output/temp/adamson/X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\", index_col=0)\n", + "df_scores = pd.read_csv(f\"resources/scores/adamson/X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\", index_col=0)\n", "# df_scores[df_scores<0] = 0\n", "df_scores_f = df_scores[['static-theta-0.0', 'static-theta-0.5', 'static-theta-1.0']]\n", "df_all_n = (df_scores_f-df_scores_f.min(axis=0))/(df_scores_f.max(axis=0)-df_scores_f.min(axis=0))\n", @@ -1930,7 +1956,7 @@ } ], "source": [ - "!ls output/temp/op/" + "!ls resources/scores/op/" ] }, { @@ -2511,8 +2537,8 @@ } ], "source": [ - "df_scores_gb = pd.read_csv(f\"output/temp/op/X_norm-50000-skeleton_False-binarize_True-ridge-global-True.csv\", index_col=0)\n", - "df_scores = pd.read_csv(f\"output/temp/op/X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\", index_col=0)\n", + "df_scores_gb = pd.read_csv(f\"resources/scores/op/X_norm-50000-skeleton_False-binarize_True-ridge-global-True.csv\", index_col=0)\n", + "df_scores = pd.read_csv(f\"resources/scores/op/X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\", index_col=0)\n", "\n", "df_scores = pd.concat([df_scores, df_scores_gb])\n", "# df_scores[df_scores<0] = 0\n", @@ -2580,7 +2606,7 @@ "source": [ "# - collect all the scores\n", "for i, dataset in enumerate(datasets):\n", - " df_scores = pd.read_csv(f\"output/temp/{dataset}/X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\", index_col=0)\n", + " df_scores = pd.read_csv(f\"resources/scores/{dataset}/X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\", index_col=0)\n", " # - normalize scores \n", " df_scores = df_scores.fillna(0)\n", " df_scores[df_scores < 0] = 0\n", diff --git a/src/metrics/script_all.py b/src/metrics/script_all.py index 1fa5fb37..94b56404 100644 --- a/src/metrics/script_all.py +++ b/src/metrics/script_all.py @@ -69,7 +69,7 @@ def define_par(dataset): global_models = False # - run metrics -for dataset in ['adamson']: #'op', 'replogle2', 'nakatake', 'norman', 'adamson' +for dataset in ['op']: #'op', 'replogle2', 'nakatake', 'norman', 'adamson' print('------ ', dataset, '------') par = define_par(dataset) os.makedirs(par['scores_dir'], exist_ok=True) @@ -78,7 +78,7 @@ def define_par(dataset): par['binarize'] = binarize for max_n_links in [50000]: par['max_n_links'] = max_n_links - for apply_skeleton in [False]: + for apply_skeleton in [True]: par['apply_skeleton'] = apply_skeleton # - determines models to run grn_files_dict = {} diff --git a/src/process_data/multiomics/format_data/script.py b/src/process_data/multiomics/format_data/script.py index c4251578..6f733916 100644 --- a/src/process_data/multiomics/format_data/script.py +++ b/src/process_data/multiomics/format_data/script.py @@ -16,8 +16,6 @@ multiomics.X = multiomics.layers['counts'] del multiomics.layers multiomics.layers['counts'] = multiomics.X.copy() -X_norm = sc.pp.normalize_total(multiomics, inplace=False)['X'] -multiomics.layers['X_norm'] = sc.pp.log1p(X_norm, copy=True) multiomics.var.index.name='location' multiomics.obs.index.name='obs_id' @@ -58,5 +56,9 @@ multiomics_rna.obs['donor_id'] = multiomics_rna.obs['donor_id'].map(donor_map) multiomics_atac.obs['donor_id'] = multiomics_atac.obs['donor_id'].map(donor_map) +# normalize rna +X_norm = sc.pp.normalize_total(multiomics_rna, inplace=False)['X'] +multiomics_rna.layers['X_norm'] = sc.pp.log1p(X_norm, copy=True) + multiomics_rna.write(par['multiomics_rna']) multiomics_atac.write(par['multiomics_atac']) \ No newline at end of file