Skip to content

Commit

Permalink
Merge pull request #16 from yuehhua/develop
Browse files Browse the repository at this point in the history
Refine figures
  • Loading branch information
yuehhua authored Jun 30, 2022
2 parents 0302b92 + 4473084 commit e7f7fff
Show file tree
Hide file tree
Showing 21 changed files with 371 additions and 316 deletions.
4 changes: 1 addition & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
Gadfly = "c91e804a-d5a3-530f-b6f0-dfbca275c004"
GaussianMixtures = "cc18c42c-b769-54ff-9e2a-b28141a64aae"
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5"
Expand All @@ -39,11 +38,10 @@ Distances = "0.10"
Distributions = "0.24 - 0.25"
FileIO = "1.14"
GLM = "1"
Gadfly = "1"
GaussianMixtures = "0.3"
Graphs = "1"
HypothesisTests = "0.10"
Missings = "0.4"
Missings = "1"
MultivariateStats = "0.9"
PlotlyBase = "0.8"
Plots = "1.30"
Expand Down
10 changes: 5 additions & 5 deletions notebook/GMM_tf_and_targ_bonemarrow.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
using Logging

using CDGRN
using CDGRNs
using SnowyOwl
using DataFrames
using JLD2
Expand All @@ -11,9 +11,9 @@ Gadfly.set_default_plot_size(8inch, 6inch)

## Load data

dir = joinpath(CDGRN.PROJECT_PATH, "results", "bonemarrow")
dir = joinpath(CDGRNs.PROJECT_PATH, "results", "bonemarrow")
prof = load_profile(dir)
tf_set = CDGRN.load_tfs(joinpath(dir, "tf_set.jld2"))
tf_set = CDGRNs.load_tfs(joinpath(dir, "tf_set.jld2"))
tfs = select_genes!(copy(prof), tf_set)

select_high_likelihood!(prof)
Expand All @@ -32,7 +32,7 @@ function log_likelihood_plot(df, tf_name, gene_name, mix_logpdf;
coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
xlabel = "log spliced RNA of TF gene, $(tf_name)"
ylabel = "log unspliced RNA of target gene, $(gene_name)"
filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "bonemarrow", "$(tf_name)-$(gene_name) log likelihood plot.svg")
filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "bonemarrow", "$(tf_name)-$(gene_name) log likelihood plot.svg")
plot(l1, l2, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
end

Expand All @@ -42,7 +42,7 @@ function cluster_plot(df, tf_name, gene_name; xmax=ceil(maximum(df.logX)), xmin=
coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
xlabel = "log spliced RNA of TF gene, $(tf_name)"
ylabel = "log unspliced RNA of target gene, $(gene_name)"
filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "bonemarrow", "$(tf_name)-$(gene_name) log cluster plot.svg")
filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "bonemarrow", "$(tf_name)-$(gene_name) log cluster plot.svg")
plot(l, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
end

Expand Down
10 changes: 5 additions & 5 deletions notebook/GMM_tf_and_targ_dentategyrus.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
using Logging

using CDGRN
using CDGRNs
using SnowyOwl
using DataFrames
using JLD2
Expand All @@ -11,9 +11,9 @@ Gadfly.set_default_plot_size(8inch, 6inch)

## Load data

dir = joinpath(CDGRN.PROJECT_PATH, "results", "dentategyrus")
dir = joinpath(CDGRNs.PROJECT_PATH, "results", "dentategyrus")
prof = load_profile(dir)
tf_set = CDGRN.load_tfs(joinpath(dir, "tf_set.jld2"))
tf_set = CDGRNs.load_tfs(joinpath(dir, "tf_set.jld2"))
tfs = select_genes!(copy(prof), tf_set)

select_high_likelihood!(prof)
Expand All @@ -32,7 +32,7 @@ function log_likelihood_plot(df, tf_name, gene_name, mix_logpdf;
coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
xlabel = "log spliced RNA of TF gene, $(tf_name)"
ylabel = "log unspliced RNA of target gene, $(gene_name)"
filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "dentategyrus", "$(tf_name)-$(gene_name) log likelihood plot.svg")
filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "dentategyrus", "$(tf_name)-$(gene_name) log likelihood plot.svg")
plot(l1, l2, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
end

Expand All @@ -42,7 +42,7 @@ function cluster_plot(df, tf_name, gene_name; xmax=ceil(maximum(df.logX)), xmin=
coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
xlabel = "log spliced RNA of TF gene, $(tf_name)"
ylabel = "log unspliced RNA of target gene, $(gene_name)"
filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "dentategyrus", "$(tf_name)-$(gene_name) log cluster plot.svg")
filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "dentategyrus", "$(tf_name)-$(gene_name) log cluster plot.svg")
plot(l, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
end

Expand Down
10 changes: 5 additions & 5 deletions notebook/GMM_tf_and_targ_gastrulation_erythroid.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
using Logging

using CDGRN
using CDGRNs
using SnowyOwl
using DataFrames
using JLD2
Expand All @@ -11,9 +11,9 @@ Gadfly.set_default_plot_size(8inch, 6inch)

## Load data

dir = joinpath(CDGRN.PROJECT_PATH, "results", "gastrulation_erythroid")
dir = joinpath(CDGRNs.PROJECT_PATH, "results", "gastrulation_erythroid")
prof = load_profile(dir)
tf_set = CDGRN.load_tfs(joinpath(dir, "tf_set.jld2"))
tf_set = CDGRNs.load_tfs(joinpath(dir, "tf_set.jld2"))
tfs = select_genes!(copy(prof), tf_set)

select_high_likelihood!(prof)
Expand All @@ -32,7 +32,7 @@ function log_likelihood_plot(df, tf_name, gene_name, mix_logpdf;
coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
xlabel = "log spliced RNA of TF gene, $(tf_name)"
ylabel = "log unspliced RNA of target gene, $(gene_name)"
filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "gastrulation_erythroid", "$(tf_name)-$(gene_name) log likelihood plot.svg")
filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "gastrulation_erythroid", "$(tf_name)-$(gene_name) log likelihood plot.svg")
plot(l1, l2, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
end

Expand All @@ -42,7 +42,7 @@ function cluster_plot(df, tf_name, gene_name; xmax=ceil(maximum(df.logX)), xmin=
coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
xlabel = "log spliced RNA of TF gene, $(tf_name)"
ylabel = "log unspliced RNA of target gene, $(gene_name)"
filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "gastrulation_erythroid", "$(tf_name)-$(gene_name) log cluster plot.svg")
filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "gastrulation_erythroid", "$(tf_name)-$(gene_name) log cluster plot.svg")
plot(l, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
end

Expand Down
10 changes: 5 additions & 5 deletions notebook/GMM_tf_and_targ_pancreas.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
using Logging

using CDGRN
using CDGRNs
using SnowyOwl
using DataFrames
using JLD2
Expand All @@ -11,9 +11,9 @@ Gadfly.set_default_plot_size(8inch, 6inch)

## Load data

dir = joinpath(CDGRN.PROJECT_PATH, "results", "pancreas")
dir = joinpath(CDGRNs.PROJECT_PATH, "results", "pancreas")
prof = load_profile(dir)
tf_set = CDGRN.load_tfs(joinpath(dir, "tf_set.jld2"))
tf_set = CDGRNs.load_tfs(joinpath(dir, "tf_set.jld2"))
tfs = select_genes!(copy(prof), tf_set)

select_high_likelihood!(prof)
Expand All @@ -32,7 +32,7 @@ function log_likelihood_plot(df, tf_name, gene_name, mix_logpdf;
coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
xlabel = "log spliced RNA of TF gene, $(tf_name)"
ylabel = "log unspliced RNA of target gene, $(gene_name)"
filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "tf-gene gmm model", "$(tf_name)-$(gene_name) log likelihood plot.svg")
filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "tf-gene gmm model", "$(tf_name)-$(gene_name) log likelihood plot.svg")
plot(l1, l2, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
end

Expand All @@ -42,7 +42,7 @@ function cluster_plot(df, tf_name, gene_name; xmax=ceil(maximum(df.logX)), xmin=
coord = Coord.cartesian(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
xlabel = "log spliced RNA of TF gene, $(tf_name)"
ylabel = "log unspliced RNA of target gene, $(gene_name)"
filepath = joinpath(CDGRN.PROJECT_PATH, "pics", "tf-gene gmm model", "$(tf_name)-$(gene_name) log cluster plot.svg")
filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "tf-gene gmm model", "$(tf_name)-$(gene_name) log cluster plot.svg")
plot(l, coord, Guide.xlabel(xlabel), Guide.ylabel(ylabel)) |> SVG(filepath, 10inch, 6inch)
end

Expand Down
10 changes: 5 additions & 5 deletions notebook/PCA_pancreas.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using CDGRN
using CDGRNs
using DataFrames
using CSV
using JLD2
Expand All @@ -10,10 +10,10 @@ default(size = (800, 600))

## Load data

dir = joinpath(CDGRN.PROJECT_PATH, "results", "pancreas")
fig_dir = joinpath(CDGRN.PROJECT_PATH, "pics", "tf-gene gmm model")
dir = joinpath(CDGRNs.PROJECT_PATH, "results", "pancreas")
fig_dir = joinpath(CDGRNs.PROJECT_PATH, "pics", "tf-gene gmm model")
prof = load_profile(dir)
tf_set = CDGRN.load_tfs(joinpath(dir, "tf_set.jld2"))
tf_set = CDGRNs.load_tfs(joinpath(dir, "tf_set.jld2"))
tfs = select_genes!(copy(prof), tf_set)

select_high_likelihood!(prof)
Expand All @@ -29,7 +29,7 @@ cor_pairs, nonsingle_pairs = regulation_correlation(filename)
true_regulations, true_reg_pairs = remove_spurious_pairs(cor_pairs, nonsingle_pairs)

df = get_regulation_expr(prof, tfs, true_regulations)
pc = CDGRN.pca_transform(Array(df[:, 3:end])', dims=5)
pc = CDGRNs.pca_transform(Array(df[:, 3:end])', dims=5)
df.PC1 = pc[:, 1]
df.PC2 = pc[:, 2]
df.PC3 = pc[:, 3]
Expand Down
75 changes: 60 additions & 15 deletions notebook/UMAP_bonemarrow.jl
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
using CDGRN
using CDGRNs
using DataFrames
using CSV
using JLD2
using FileIO
using UMAP
using Plots
using StatsPlots
gr()

## Load data

dir = joinpath(CDGRN.PROJECT_PATH, "results", "bonemarrow")
fig_dir = joinpath(CDGRN.PROJECT_PATH, "pics", "bonemarrow")
dir = joinpath(CDGRNs.PROJECT_PATH, "results", "bonemarrow")
fig_dir = joinpath(CDGRNs.PROJECT_PATH, "pics", "bonemarrow")
prof = load_profile(dir)
tf_set = CDGRN.load_tfs(joinpath(dir, "tf_set.jld2"))
tf_set = CDGRNs.load_tfs(joinpath(dir, "tf_set.jld2"))
tfs = select_genes!(copy(prof), tf_set)

select_high_likelihood!(prof)
Expand All @@ -28,20 +25,68 @@ filename = joinpath(dir, "GMM-model-selection-result.jld2")
cor_pairs, nonsingle_pairs = regulation_correlation(filename)
true_regulations, true_reg_pairs = remove_spurious_pairs(cor_pairs, nonsingle_pairs)

k = 9
tree, cell_clusters = build_tree(prof, true_reg_pairs, col_palette=:default)
extract_context!(cell_clusters, tree, k)

df = get_regulation_expr(prof, tfs, true_regulations)
embedding = umap(Array(df[:, 3:end])', 2, n_neighbors=20, min_dist=0.5)
df.UMAP1 = embedding[1, :]
df.UMAP2 = embedding[2, :]
df.k9 = cell_clusters.k9

plot_configs = (markersize=2, markerstrokewidth=0, dpi=300, size=(1000, 600), thickness_scaling=2, widen=false)

plot_configs = (markersize=2, markerstrokewidth=0, dpi=300)
p = @df df scatter(:UMAP1, :UMAP2; group=:cell, xlabel="UMAP1", ylabel="UMAP2", color_palette=:glasbey_hv_n256,
legend=:outertopright, markersize=2, markerstrokewidth=0,
dpi=300, size=(1000, 600), thickness_scaling=2, widen=false)
savefig(p, joinpath(fig_dir, "UMAP", "umap-cell type.svg"))
savefig(p, joinpath(fig_dir, "UMAP", "umap-cell type.png"))

@df df scatter(:UMAP1, :UMAP2; group=:cell, xlabel="UMAP1", ylabel="UMAP2", color_palette=:glasbey_hv_n256, legend=:outertopright, plot_configs...)
savefig(joinpath(fig_dir, "UMAP", "umap-cell type.svg"))
savefig(joinpath(fig_dir, "UMAP", "umap-cell type.png"))
p = @df df scatter(:UMAP1, :UMAP2; group=:k9, xlabel="UMAP1", ylabel="UMAP2", color_palette=:glasbey_hv_n256,
legend=:outerright, markersize=2, markerstrokewidth=0,
dpi=300, size=(800, 600), thickness_scaling=2, widen=false)
savefig(p, joinpath(fig_dir, "UMAP", "umap-context.svg"))
savefig(p, joinpath(fig_dir, "UMAP", "umap-context.png"))

@df df scatter(:UMAP1, :UMAP2; zcolor=:time, c=:coolwarm, xlabel="UMAP1", ylabel="UMAP2",
p = @df df scatter(:UMAP1, :UMAP2; zcolor=:time, c=:coolwarm, xlabel="UMAP1", ylabel="UMAP2",
legend=false, cb=:outerright, plot_configs...)
savefig(joinpath(fig_dir, "UMAP", "umap-latent time.svg"))
savefig(joinpath(fig_dir, "UMAP", "umap-latent time.png"))
savefig(p, joinpath(fig_dir, "UMAP", "umap-latent time.svg"))
savefig(p, joinpath(fig_dir, "UMAP", "umap-latent time.png"))


# Train CDGRNs over contexts

cortable = train_cdgrns(tfs, prof, true_regulations,
cell_clusters.k9, [2, 3, 6, 7, 8],
joinpath(dir, "cdgrn_k9"))


# Network entropy

cdgrn_stats = DataFrame(cntx=String[], V=Int[], E=Int[], entropy=Float64[])
for i in [2, 7, 6, 3]
E = nrow(cortable[i])
V = length(unique(vcat(cortable[i].tf, cortable[i].target)))
cortable[i].dist = cor2dist.(cortable[i].ρ)
entr = network_entropy(to_graph(cortable[i]))
push!(cdgrn_stats, ("$i", V, E, entr))
end
cdgrn_stats[!, :order] = [1, 2, 3, 4]

p = @df cdgrn_stats plot(:order, [:V, :E],
label=["Number of node" "Number of edge"],
xlabel="Context", ylabel="Network size", legend=:topright,
thickness_scaling=2, widen=false, dpi=300, size=(800, 600)
)
xticks!([1:4;], cdgrn_stats[!,:cntx])
filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "bonemarrow", "CDGRN", "network_size k9.png")
savefig(p, filepath)

CSV.write(joinpath(dir, "umap.tsv"), df, delim='\t')
p = @df cdgrn_stats plot(:order, :entropy, label="Network entropy",
xlabel="Context", ylabel="Network entropy", legend=:topright,
thickness_scaling=2, widen=false, dpi=300, size=(800, 600)
)
xticks!([1:4;], cdgrn_stats[!,:cntx])
filepath = joinpath(CDGRNs.PROJECT_PATH, "pics", "bonemarrow", "CDGRN", "network_entropy k9.png")
savefig(p, filepath)
Loading

0 comments on commit e7f7fff

Please sign in to comment.