clusters overlap on umap #4

Roger-GOAT · 2021-05-17T14:27:58Z

Dear team, thank you for your wonderful software RISC. It is great!
However, I run umap and find the clusters crowd together and overlap (pic in the attachment)
And the code below (scRNA is my Seurat object with four batches day0, day2, day5, and day14.)
Thank you!

library(RISC)
library(Matrix)
library(ggplot2)
library(RColorBrewer)
Idents(scRNA) <- 'orig.ident'
sc0 <- subset(scRNA,idents='day0')
mat0 = sc0@assays$RNA@counts
coldat0 = sc0@meta.data
rawdata0 = data.frame(Symbol = rownames(mat0), RNA = "Gene Expression", row.names = rownames(mat0))
dat0 = readscdata(count = mat0, cell = coldat0, gene = rawdata0)
sc2 <- subset(scRNA,idents='day2')
mat2 = sc2@assays$RNA@counts
coldat2 = sc2@meta.data
rawdata2 = data.frame(Symbol = rownames(mat2), RNA = "Gene Expression", row.names = rownames(mat2))
dat2 = readscdata(count = mat2, cell = coldat2, gene = rawdata2)
sc5 <- subset(scRNA,idents='day5')
mat5 = sc5@assays$RNA@counts
coldat5 = sc5@meta.data
rawdata5 = data.frame(Symbol = rownames(mat5), RNA = "Gene Expression", row.names = rownames(mat5))
dat5 = readscdata(count = mat5, cell = coldat5, gene = rawdata5)
sc14 <- subset(scRNA,idents='day14')
mat14 = sc14@assays$RNA@counts
coldat14 = sc14@meta.data
rawdata14 = data.frame(Symbol = rownames(mat14), RNA = "Gene Expression", row.names = rownames(mat14))
dat14 = readscdata(count = mat14, cell = coldat14, gene = rawdata14)

FilterPlot(dat0)
FilterPlot(dat2)
FilterPlot(dat5)
FilterPlot(dat14)

Per0 <- function(obj0){
  count0 = obj0@assay$count
  umi.gene = Matrix::rowSums(count0)
  num0 = 100 # top 100 genes
  umi.gene.per = sort(umi.gene / sum(umi.gene), decreasing = TRUE)[1:num0]
  m0 = data.frame(Num = 1:num0, Per = as.numeric(umi.gene.per))
  g0 = ggplot(m0, aes(Num, Per)) +
    geom_bar(stat = 'identity') +
    labs(x = "Genes ranked by UMIs", y = "UMI(gene) / UMI(total)") +
    theme_classic(base_size = 12, base_line_size = 0)
  print(g0)
}
Per0(dat0)
Per0(dat2)
Per0(dat5)
Per0(dat14)

process0 <- function(obj0){
  # Filter cells and genes
  obj0 = scFilter(obj0, min.UMI = 1000, max.UMI = 40000, min.gene = 200, min.cell = 3)
  # Normalize the raw counts
  obj0 = scNormalize(obj0)
  # Find highly variable genes
  obj0 = scDisperse(obj0)
  # print(length(obj0@vargene))
  return(obj0)
}
dat0 = process0(dat0)
dat2 = process0(dat2)
dat5 = process0(dat5)
dat14 = process0(dat14)

par(mfrow = c(2, 2))
# Check Quality
FilterPlot(dat0)
FilterPlot(dat2)
FilterPlot(dat5)
FilterPlot(dat14)

Disper0 <- function(obj0){
  # Read the gene expression variance meta-data
  m0 = as.data.frame(obj0@metadata$dispersion)
  # Draw plots
  g0 = ggplot(m0, aes(Mean, SD)) +
    geom_point(size = 1) +
    geom_smooth(method = "loess", span = 0.85, formula = "y ~ x") +
    labs(x = "Average (gene expression)", y = "Standard deviation (gene expression)") +
    theme_bw(base_size = 16, base_line_size = 0)
  print(g0)
}
# Check Quality
Disper0(dat0)
Disper0(dat2)
Disper0(dat5)
Disper0(dat14)

var0 = Reduce(
  intersect, list(dat0@rowdata$Symbol, dat2@rowdata$Symbol, dat5@rowdata$Symbol, dat14@rowdata$Symbol))

# Choose a good reference dataset
data0 = list(dat0, dat2, dat5, dat14)

InPlot(data0, var.gene = var0, Std.cut = 0.99, ncore = 8, minPC = 16, nPC = 25)
saveRDS(data0,file = './result/RISC/data0.rds')
#weighted by “Cluster Num.” score > “Stv by PCs” score > “Kolmogorov-Smirnov” score,
#but if “Kolmogorov-Smirnov” value of one set (the red values in the violin plot) are extremely high, we do not use
#the set as the reference, since the eigenvector distribution in this set is abnormal.
data0 = list(dat14, dat0, dat2, dat5)
data0 = scMultiIntegrate(
  objects = data0, eigens = 14, add.Id = NULL, var.gene = var0,
  method = "RPCI", align = 'OLS', npc = 50, adjust = TRUE,
  ncore = 16, do.fast = "AUTO"
)
# Calculate UMAP.
data0 = scUMAP(data0, npc = 18, use = "PLS")
pls0 = as.matrix(data0@DimReduction$cell.pls)

library(FNN)
library(igraph)
# Obtain integrated cell-eigenvectors
pls0 = as.matrix(data0@DimReduction$cell.pls[,1:18])
data0 = scCluster(data0, slot = "cell.pls", neighbor = 3, method = "louvain", npc = 18)
print(table(data0@coldata$Cluster))
DimPlot(data0, slot = "cell.umap", colFactor = "Cluster",label = T)

The text was updated successfully, but these errors were encountered:

bioinfoDZ · 2021-06-17T13:41:14Z

Hi Roger, sorry to reply a little late, I check the issue periodically.

Usually, we do not evaluate other groups' analyses, because people can use the codes and paramters they want. Here I want to remind you two things:

(1) in functions {scMultiIntegrate, scUMAP, scCluster}, it is better to use the same number of PCs, eigens == npc (scUMAP) == npc (scCluster). This is just based on my experience.

(2) I suggest you change your code about variable genes (also based on my exprience):
From
var0 = Reduce(intersect, list(dat0@rowdata$Symbol, dat2@rowdata$Symbol, dat5@rowdata$Symbol, dat14@rowdata$Symbol))
To
var0 = unique(c(dat0@vargene, dat2@vargene, dat5@vargene, dat14@vargene))
length(var0)
if length(var0) around 5000, I will use this.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

clusters overlap on umap #4

clusters overlap on umap #4

Roger-GOAT commented May 17, 2021

bioinfoDZ commented Jun 17, 2021 •

edited

Loading

clusters overlap on umap #4

clusters overlap on umap #4

Comments

Roger-GOAT commented May 17, 2021

bioinfoDZ commented Jun 17, 2021 • edited Loading

bioinfoDZ commented Jun 17, 2021 •

edited

Loading