-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClean_Data_Input_Code.R
129 lines (96 loc) · 3.87 KB
/
Clean_Data_Input_Code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
setwd('~/Data-Science/Project-work/Lychee-Project/')
library(WGCNA)
options(stringsAsFactors = FALSE)
smlseed <- read.csv('Small_seeded_HSP.csv')
lrgseed <- read.csv('Large_seeded_HSP.csv')
names(smlseed) <- c("Contig", "Day6", "Day14", "Day0")
names(lrgseed) <- c("Contig", "Day6", "Day14", "Day0")
smlseed2 <- t(smlseed)
lrgseed2 <- t(t(lrgseed))
## For a special list
#gene_list <- read.table('Gene_list.txt', header = FALSE, sep = '\n')
#gene_list <- as.vector(gene_list, mode = 'character')
#gene_index <- match(gene_list[, 1], smlseed$Contig)
#smlseed <- smlseed[gene_index, ]
#lrgseed <- lrgseed[gene_index, ]
## Making multiset Data
nsets <- 2
seedlabels <- c('small_seed', 'large_seed')
multiExp <- vector(mode = "list", length=nsets)
multiExp[[1]] <- list(data = as.data.frame(t(smlseed[-1])))
names(multiExp[[1]]$data) <- smlseed$Contig
rownames(multiExp[[1]]$data) <- names(smlseed)[-1]
multiExp[[2]] <- list(data = as.data.frame(t(lrgseed[-1])))
names(multiExp[[2]]$data) <- lrgseed$Contig
rownames(multiExp[[2]]$data) <- names(lrgseed)[-1]
## Taking log of expression data in order to normalize it
library(doParallel)
cl <- makeCluster(3)
registerDoParallel(cl)
multiExp[[1]]$data <- log2(multiExp[[1]]$data + 1)
multiExp[[2]]$data <- log2(multiExp[[2]]$data + 1)
#Calculation of variance of Genes across samples
var.small <- vector(mode = "numeric", length = dim(multiExp[[1]]$data)[2])
var.large <- vector(mode = "numeric", length = dim(multiExp[[2]]$data)[2])
for(i in 1:dim(multiExp[[1]]$data)[2]) {
var.small[i] <- var(multiExp[[1]]$data[, i])
var.large[i] <- var(multiExp[[2]]$data[, i])
}
var.small.sorted <- sort(var.small, decreasing = TRUE)
var.large.sorted <- sort(var.large, decreasing = TRUE)
stopCluster(cl)
contig_to_keep <- match(var.small.sorted[1:8000], var.small)
multiExp[[1]]$data <- multiExp[[1]]$data[, contig_to_keep]
multiExp[[2]]$data <- multiExp[[2]]$data[, contig_to_keep]
#threshold.variance <- 15.0
#high.varGenes <- (var.small > threshold.variance) | (var.large > threshold.variance)
## Retaining only those genes who have high variance
#multiExp[[1]]$data <- multiExp[[1]]$data[, high.varGenes]
#multiExp[[2]]$data <- multiExp[[2]]$data[, high.varGenes]
# ### Checking Genes whose counts are less than 5 across all samples
# noiseGenesml <- vector(mode = "logical", length = dim(smlseed2)[1])
# noiseGenelrg <- vector(mode = "logical", length = dim(lrgseed2)[1])
# for(i in 1:dim(smlseed2)[1]) {
# noiseGenesml[i] <- max(smlseed2[i, c(-1)]) < 5
# noiseGenelrg[i] <- max(lrgseed2[i, c(-1)]) < 5
#
# }
# table(noiseGenesml == noiseGenelrg)
# NoiseGenes.Consensus <- (noiseGenesml & noiseGenelrg)
#
# for(i in 1:nsets) {
# multiExp[[i]]$data <- multiExp[[i]]$data[, !NoiseGenes.Consensus]
# }
#
#
#
#
# var.vector.small <- vector(mode = "numeric", length = dim(multiExp[[1]]$data)[2])
# var.vector.large <- vector(mode = "numeric", length = dim(multiExp[[2]]$data)[2])
# for(i in 1:dim(multiExp[[1]]$data)[2]) {
# var.vector.small[i] <- var(multiExp[[1]]$data[, i])
# var.vector.large[i] <- var(multiExp[[2]]$data[, i])
#
# }
## Checking if multiExp has correct multiset structure
exprSize <- checkSets(multiExp)
exprSize
#gsg = goodSamplesGenesMS(multiExpr, verbose = 5)
#gsg$allOK
## Gene Clustering using average linkage hierarchical clustering
seedsampletrees <- list()
for (set in 1:exprSize$nSets) {
seedsampletrees[[set]] <- hclust(dist(multiExp[[set]]$data), method = "average")
}
##Plotting the results
pdf(file = "Seedsampleclustering.pdf", height = 12, width = 12)
par(mfrow=c(2,1))
par(mar = c(0, 4, 2, 0))
for (set in 1:nsets)
plot(seedsampletrees[[set]], main = paste("Sample clustering across all genes in",
seedlabels[set]), xlab="", sub="", cex = 0.7)
dev.off()
nGenes <- exprSize$nGenes
nSamples <- exprSize$nSamples
save(multiExp, nGenes, nSamples, nsets, seedlabels, exprSize,
file = "Lychee_Consensus_Input_Data.Rdata")