Merge pull request #16 from yuehhua/develop

Refine figures
yuehhua · Jun 30, 2022 · e7f7fff · e7f7fff
2 parents 0302b92 + 4473084
commit e7f7fff
Show file tree

Hide file tree

Showing 21 changed files with 371 additions and 316 deletions.
diff --git a/Project.toml b/Project.toml
@@ -12,7 +12,6 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
 GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
-Gadfly = "c91e804a-d5a3-530f-b6f0-dfbca275c004"
 GaussianMixtures = "cc18c42c-b769-54ff-9e2a-b28141a64aae"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5"
@@ -39,11 +38,10 @@ Distances = "0.10"
 Distributions = "0.24 - 0.25"
 FileIO = "1.14"
 GLM = "1"
-Gadfly = "1"
 GaussianMixtures = "0.3"
 Graphs = "1"
 HypothesisTests = "0.10"
-Missings = "0.4"
+Missings = "1"
 MultivariateStats = "0.9"
 PlotlyBase = "0.8"
 Plots = "1.30"

diff --git a/notebook/GMM_tf_and_targ_bonemarrow.jl b/notebook/GMM_tf_and_targ_bonemarrow.jl
@@ -1,6 +1,6 @@
 using Logging
 
-using CDGRN
+using CDGRNs
 using SnowyOwl
 using DataFrames
 using JLD2
@@ -11,9 +11,9 @@ Gadfly.set_default_plot_size(8inch, 6inch)
 
 ## Load data
 
-dir = joinpath(CDGRN.PROJECT_PATH, "results", "bonemarrow")
+dir = joinpath(CDGRNs.PROJECT_PATH, "results", "bonemarrow")
 prof = load_profile(dir)
-tf_set = CDGRN.load_tfs(joinpath(dir, "tf_set.jld2"))
+tf_set = CDGRNs.load_tfs(joinpath(dir, "tf_set.jld2"))
 tfs = select_genes!(copy(prof), tf_set)
 
 select_high_likelihood!(prof)
@@ -32,7 +32,7 @@ function log_likelihood_plot(df, tf_name, gene_name, mix_logpdf;
     coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
     xlabel = "log spliced RNA of TF gene, $(tf_name)"
     ylabel = "log unspliced RNA of target gene, $(gene_name)"
-    filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "bonemarrow", "$(tf_name)-$(gene_name) log likelihood plot.svg")
+    filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "bonemarrow", "$(tf_name)-$(gene_name) log likelihood plot.svg")
     plot(l1, l2, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
 end
 
@@ -42,7 +42,7 @@ function cluster_plot(df, tf_name, gene_name; xmax=ceil(maximum(df.logX)), xmin=
     coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
     xlabel = "log spliced RNA of TF gene, $(tf_name)"
     ylabel = "log unspliced RNA of target gene, $(gene_name)"
-    filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "bonemarrow", "$(tf_name)-$(gene_name) log cluster plot.svg")
+    filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "bonemarrow", "$(tf_name)-$(gene_name) log cluster plot.svg")
     plot(l, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
 end
 

diff --git a/notebook/GMM_tf_and_targ_dentategyrus.jl b/notebook/GMM_tf_and_targ_dentategyrus.jl
@@ -1,6 +1,6 @@
 using Logging
 
-using CDGRN
+using CDGRNs
 using SnowyOwl
 using DataFrames
 using JLD2
@@ -11,9 +11,9 @@ Gadfly.set_default_plot_size(8inch, 6inch)
 
 ## Load data
 
-dir = joinpath(CDGRN.PROJECT_PATH, "results", "dentategyrus")
+dir = joinpath(CDGRNs.PROJECT_PATH, "results", "dentategyrus")
 prof = load_profile(dir)
-tf_set = CDGRN.load_tfs(joinpath(dir, "tf_set.jld2"))
+tf_set = CDGRNs.load_tfs(joinpath(dir, "tf_set.jld2"))
 tfs = select_genes!(copy(prof), tf_set)
 
 select_high_likelihood!(prof)
@@ -32,7 +32,7 @@ function log_likelihood_plot(df, tf_name, gene_name, mix_logpdf;
     coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
     xlabel = "log spliced RNA of TF gene, $(tf_name)"
     ylabel = "log unspliced RNA of target gene, $(gene_name)"
-    filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "dentategyrus", "$(tf_name)-$(gene_name) log likelihood plot.svg")
+    filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "dentategyrus", "$(tf_name)-$(gene_name) log likelihood plot.svg")
     plot(l1, l2, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
 end
 
@@ -42,7 +42,7 @@ function cluster_plot(df, tf_name, gene_name; xmax=ceil(maximum(df.logX)), xmin=
     coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
     xlabel = "log spliced RNA of TF gene, $(tf_name)"
     ylabel = "log unspliced RNA of target gene, $(gene_name)"
-    filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "dentategyrus", "$(tf_name)-$(gene_name) log cluster plot.svg")
+    filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "dentategyrus", "$(tf_name)-$(gene_name) log cluster plot.svg")
     plot(l, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
 end
 

diff --git a/notebook/GMM_tf_and_targ_gastrulation_erythroid.jl b/notebook/GMM_tf_and_targ_gastrulation_erythroid.jl
@@ -1,6 +1,6 @@
 using Logging
 
-using CDGRN
+using CDGRNs
 using SnowyOwl
 using DataFrames
 using JLD2
@@ -11,9 +11,9 @@ Gadfly.set_default_plot_size(8inch, 6inch)
 
 ## Load data
 
-dir = joinpath(CDGRN.PROJECT_PATH, "results", "gastrulation_erythroid")
+dir = joinpath(CDGRNs.PROJECT_PATH, "results", "gastrulation_erythroid")
 prof = load_profile(dir)
-tf_set = CDGRN.load_tfs(joinpath(dir, "tf_set.jld2"))
+tf_set = CDGRNs.load_tfs(joinpath(dir, "tf_set.jld2"))
 tfs = select_genes!(copy(prof), tf_set)
 
 select_high_likelihood!(prof)
@@ -32,7 +32,7 @@ function log_likelihood_plot(df, tf_name, gene_name, mix_logpdf;
     coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
     xlabel = "log spliced RNA of TF gene, $(tf_name)"
     ylabel = "log unspliced RNA of target gene, $(gene_name)"
-    filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "gastrulation_erythroid", "$(tf_name)-$(gene_name) log likelihood plot.svg")
+    filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "gastrulation_erythroid", "$(tf_name)-$(gene_name) log likelihood plot.svg")
     plot(l1, l2, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
 end
 
@@ -42,7 +42,7 @@ function cluster_plot(df, tf_name, gene_name; xmax=ceil(maximum(df.logX)), xmin=
     coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
     xlabel = "log spliced RNA of TF gene, $(tf_name)"
     ylabel = "log unspliced RNA of target gene, $(gene_name)"
-    filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "gastrulation_erythroid", "$(tf_name)-$(gene_name) log cluster plot.svg")
+    filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "gastrulation_erythroid", "$(tf_name)-$(gene_name) log cluster plot.svg")
     plot(l, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
 end
 

diff --git a/notebook/GMM_tf_and_targ_pancreas.jl b/notebook/GMM_tf_and_targ_pancreas.jl
@@ -1,6 +1,6 @@
 using Logging
 
-using CDGRN
+using CDGRNs
 using SnowyOwl
 using DataFrames
 using JLD2
@@ -11,9 +11,9 @@ Gadfly.set_default_plot_size(8inch, 6inch)
 
 ## Load data
 
-dir = joinpath(CDGRN.PROJECT_PATH, "results", "pancreas")
+dir = joinpath(CDGRNs.PROJECT_PATH, "results", "pancreas")
 prof = load_profile(dir)
-tf_set = CDGRN.load_tfs(joinpath(dir, "tf_set.jld2"))
+tf_set = CDGRNs.load_tfs(joinpath(dir, "tf_set.jld2"))
 tfs = select_genes!(copy(prof), tf_set)
 
 select_high_likelihood!(prof)
@@ -32,7 +32,7 @@ function log_likelihood_plot(df, tf_name, gene_name, mix_logpdf;
     coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
     xlabel = "log spliced RNA of TF gene, $(tf_name)"
     ylabel = "log unspliced RNA of target gene, $(gene_name)"
-    filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "tf-gene gmm model", "$(tf_name)-$(gene_name) log likelihood plot.svg")
+    filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "tf-gene gmm model", "$(tf_name)-$(gene_name) log likelihood plot.svg")
     plot(l1, l2, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
 end
 
@@ -42,7 +42,7 @@ function cluster_plot(df, tf_name, gene_name; xmax=ceil(maximum(df.logX)), xmin=
     coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
     xlabel = "log spliced RNA of TF gene, $(tf_name)"
     ylabel = "log unspliced RNA of target gene, $(gene_name)"
-    filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "tf-gene gmm model", "$(tf_name)-$(gene_name) log cluster plot.svg")
+    filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "tf-gene gmm model", "$(tf_name)-$(gene_name) log cluster plot.svg")
     plot(l, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
 end
 

diff --git a/notebook/PCA_pancreas.jl b/notebook/PCA_pancreas.jl
@@ -1,4 +1,4 @@
-using CDGRN
+using CDGRNs
 using DataFrames
 using CSV
 using JLD2
@@ -10,10 +10,10 @@ default(size = (800, 600))
 
 ## Load data
 
-dir = joinpath(CDGRN.PROJECT_PATH, "results", "pancreas")
-fig_dir = joinpath(CDGRN.PROJECT_PATH, "pics", "tf-gene gmm model")
+dir = joinpath(CDGRNs.PROJECT_PATH, "results", "pancreas")
+fig_dir = joinpath(CDGRNs.PROJECT_PATH, "pics", "tf-gene gmm model")
 prof = load_profile(dir)
-tf_set = CDGRN.load_tfs(joinpath(dir, "tf_set.jld2"))
+tf_set = CDGRNs.load_tfs(joinpath(dir, "tf_set.jld2"))
 tfs = select_genes!(copy(prof), tf_set)
 
 select_high_likelihood!(prof)
@@ -29,7 +29,7 @@ cor_pairs, nonsingle_pairs = regulation_correlation(filename)
 true_regulations, true_reg_pairs = remove_spurious_pairs(cor_pairs, nonsingle_pairs)
 
 df = get_regulation_expr(prof, tfs, true_regulations)
-pc = CDGRN.pca_transform(Array(df[:, 3:end])', dims=5)
+pc = CDGRNs.pca_transform(Array(df[:, 3:end])', dims=5)
 df.PC1 = pc[:, 1]
 df.PC2 = pc[:, 2]
 df.PC3 = pc[:, 3]

diff --git a/notebook/UMAP_bonemarrow.jl b/notebook/UMAP_bonemarrow.jl
@@ -1,19 +1,16 @@
-using CDGRN
+using CDGRNs
 using DataFrames
-using CSV
-using JLD2
-using FileIO
 using UMAP
 using Plots
 using StatsPlots
 gr()
 
 ## Load data
 
-dir = joinpath(CDGRN.PROJECT_PATH, "results", "bonemarrow")
-fig_dir = joinpath(CDGRN.PROJECT_PATH, "pics", "bonemarrow")
+dir = joinpath(CDGRNs.PROJECT_PATH, "results", "bonemarrow")
+fig_dir = joinpath(CDGRNs.PROJECT_PATH, "pics", "bonemarrow")
 prof = load_profile(dir)
-tf_set = CDGRN.load_tfs(joinpath(dir, "tf_set.jld2"))
+tf_set = CDGRNs.load_tfs(joinpath(dir, "tf_set.jld2"))
 tfs = select_genes!(copy(prof), tf_set)
 
 select_high_likelihood!(prof)
@@ -28,20 +25,68 @@ filename = joinpath(dir, "GMM-model-selection-result.jld2")
 cor_pairs, nonsingle_pairs = regulation_correlation(filename)
 true_regulations, true_reg_pairs = remove_spurious_pairs(cor_pairs, nonsingle_pairs)
 
+k = 9
+tree, cell_clusters = build_tree(prof, true_reg_pairs, col_palette=:default)
+extract_context!(cell_clusters, tree, k)
+
 df = get_regulation_expr(prof, tfs, true_regulations)
 embedding = umap(Array(df[:, 3:end])', 2, n_neighbors=20, min_dist=0.5)
 df.UMAP1 = embedding[1, :]
 df.UMAP2 = embedding[2, :]
+df.k9 = cell_clusters.k9
+
+plot_configs = (markersize=2, markerstrokewidth=0, dpi=300, size=(1000, 600), thickness_scaling=2, widen=false)
 
-plot_configs = (markersize=2, markerstrokewidth=0, dpi=300)
+p = @df df scatter(:UMAP1, :UMAP2; group=:cell, xlabel="UMAP1", ylabel="UMAP2", color_palette=:glasbey_hv_n256,
+    legend=:outertopright, markersize=2, markerstrokewidth=0,
+    dpi=300, size=(1000, 600), thickness_scaling=2, widen=false)
+savefig(p, joinpath(fig_dir, "UMAP", "umap-cell type.svg"))
+savefig(p, joinpath(fig_dir, "UMAP", "umap-cell type.png"))
 
-@df df scatter(:UMAP1, :UMAP2; group=:cell, xlabel="UMAP1", ylabel="UMAP2", color_palette=:glasbey_hv_n256, legend=:outertopright, plot_configs...)
-savefig(joinpath(fig_dir, "UMAP", "umap-cell type.svg"))
-savefig(joinpath(fig_dir, "UMAP", "umap-cell type.png"))
+p = @df df scatter(:UMAP1, :UMAP2; group=:k9, xlabel="UMAP1", ylabel="UMAP2", color_palette=:glasbey_hv_n256,
+    legend=:outerright, markersize=2, markerstrokewidth=0,
+    dpi=300, size=(800, 600), thickness_scaling=2, widen=false)
+savefig(p, joinpath(fig_dir, "UMAP", "umap-context.svg"))
+savefig(p, joinpath(fig_dir, "UMAP", "umap-context.png"))
 
-@df df scatter(:UMAP1, :UMAP2; zcolor=:time, c=:coolwarm, xlabel="UMAP1", ylabel="UMAP2",
+p = @df df scatter(:UMAP1, :UMAP2; zcolor=:time, c=:coolwarm, xlabel="UMAP1", ylabel="UMAP2",
                legend=false, cb=:outerright, plot_configs...)
-savefig(joinpath(fig_dir, "UMAP", "umap-latent time.svg"))
-savefig(joinpath(fig_dir, "UMAP", "umap-latent time.png"))
+savefig(p, joinpath(fig_dir, "UMAP", "umap-latent time.svg"))
+savefig(p, joinpath(fig_dir, "UMAP", "umap-latent time.png"))
+
+
+# Train CDGRNs over contexts
+
+cortable = train_cdgrns(tfs, prof, true_regulations,
+    cell_clusters.k9, [2, 3, 6, 7, 8],
+    joinpath(dir, "cdgrn_k9"))
+
+
+# Network entropy
+
+cdgrn_stats = DataFrame(cntx=String[], V=Int[], E=Int[], entropy=Float64[])
+for i in [2, 7, 6, 3]
+    E = nrow(cortable[i])
+    V = length(unique(vcat(cortable[i].tf, cortable[i].target)))
+    cortable[i].dist = cor2dist.(cortable[i].ρ)
+    entr = network_entropy(to_graph(cortable[i]))
+    push!(cdgrn_stats, ("$i", V, E, entr))
+end
+cdgrn_stats[!, :order] = [1, 2, 3, 4]
+
+p = @df cdgrn_stats plot(:order, [:V, :E],
+    label=["Number of node" "Number of edge"],
+    xlabel="Context", ylabel="Network size", legend=:topright,
+    thickness_scaling=2, widen=false, dpi=300, size=(800, 600)
+)
+xticks!([1:4;], cdgrn_stats[!,:cntx])
+filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "bonemarrow", "CDGRN", "network_size k9.png")
+savefig(p, filepath)
 
-CSV.write(joinpath(dir, "umap.tsv"), df, delim='\t')
+p = @df cdgrn_stats plot(:order, :entropy, label="Network entropy",
+    xlabel="Context", ylabel="Network entropy", legend=:topright,
+    thickness_scaling=2, widen=false, dpi=300, size=(800, 600)
+)
+xticks!([1:4;], cdgrn_stats[!,:cntx])
+filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "bonemarrow", "CDGRN", "network_entropy k9.png")
+savefig(p, filepath)