-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclustering.R
92 lines (76 loc) · 3.75 KB
/
clustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
setwd("~/git/qs-survey")
load("qs-data-2014-03-31.RData")
require(Hmisc)
require(descr)
require(foreign)
require(plyr)
require(clusterCons)
source("/Users/Ulrich/git/R-projects/consensusclustering/cluscomp_pb_mach.r")
source("/Users/Ulrich/git/R-projects/consensusclustering/cc_robustness.R")
set.seed(4994)
# Make respid the rowlabels for CC, input data for allocation algorithm
rmclus <- data.frame(sapply(personas, as.numeric))
rmclus$respid <- 1:105
rownames(rmclus) <- rmclus$respid
rmclus[,1] <- NULL
# Recommended are 1000 replications; time-consuming.
system.time(
dataseg <- cluscomp_mach(rmclus, diss = FALSE, algorithms = list("kmeans"),
alparams = list(iter.max = 50), clmin = 3, clmax = 6, prop = 0.8,
reps = 1000)
)
# Calculate robustness of each segment solution via the clusterCons package.
# Result is called 'clustrobexp'
cc_robust_package(3, 6, dataseg)
# Alternative way with highest and number of re-allocated segment membership
# Takes some time to calculate individual membership robustness 'memrob'
# Not the ideal way because 'clusterallocation' gets overwritten
# To save the result specify: segmentX <- clusterallocation
#
# 'cc_robust_cust' gives you the alternative robustness statistic AND
# the number of respondents which are allocated into different segments.
cc_robust_cust(3, dataseg$e1_kmeans_k3)
segment3 <- clusterallocation
cc_robust_cust(4, dataseg$e1_kmeans_k4)
segment4 <- clusterallocation
cc_robust_cust(5, dataseg$e1_kmeans_k5)
segment5 <- clusterallocation
cc_robust_cust(6, dataseg$e1_kmeans_k6)
segment6 <- clusterallocation
all_clusters <- data.frame(rmclus$respid, segment3, segment4, segment5, segment6)
write.csv(all_clusters, "Segments allocation 3-6.csv")
# Optional: Compare segmentation solution.
colours <- c(odi_dBlue, odi_dPink, odi_purple, odi_orange)
crosstab(all_clusters[, 3], all_clusters[, 2], xlab = "4 segments", ylab = "3 segments", color = colours)
crosstab(all_clusters[, 4], all_clusters[, 3], xlab = "5 segments", ylab = "4 segments", color = colours)
crosstab(all_clusters[, 5], all_clusters[, 3], xlab = "6 segments", ylab = "4 segments", color = colours)
# heatmaps - takes very long to run
jpeg("graphics/heatmap 3 seg.jpg", width = 800, height = 800)
system.time(heatmap(dataseg$e1_kmeans_k3@cm, labRow="", labCol=""))
dev.off()
jpeg("graphics/heatmap 4 seg.jpg", width = 800, height = 800)
system.time(heatmap(dataseg$e1_kmeans_k4@cm, labRow="", labCol=""))
dev.off()
jpeg("graphics/heatmap 5 seg.jpg", width = 800, height = 800)
system.time(heatmap(dataseg$e1_kmeans_k5@cm, labRow="", labCol=""))
dev.off()
jpeg("graphics/heatmap 6 seg.jpg", width = 800, height = 800)
system.time(heatmap(dataseg$e1_kmeans_k6@cm, labRow="", labCol=""))
dev.off()
#------------------------------
# Add cluster allocation to data
personas.cluster <- cbind(all_clusters[, -1], personas)
names(personas.cluster)[names(personas.cluster)=="max.col.mrm."] <- "seg3"
names(personas.cluster)[names(personas.cluster)=="max.col.mrm..1"] <- "seg4"
names(personas.cluster)[names(personas.cluster)=="max.col.mrm..2"] <- "seg5"
names(personas.cluster)[names(personas.cluster)=="max.col.mrm..3"] <- "seg6"
# Select factors
is.fact <- sapply(personas.cluster, is.factor)
str(personas.cluster[, is.fact])
is.not.fact <- !sapply(personas.cluster, is.factor)
ddply(personas.cluster[, is.not.fact], ~ seg4, summarise, mean)
library(psych)
groups <- describeBy(personas.cluster, personas.cluster$seg4, mat = TRUE)
write.csv(as.data.frame(groups), "groups-by-seg4.csv")
groups <- describeBy(personas.cluster, personas.cluster$seg3, mat = TRUE)
write.csv(as.data.frame(groups), "groups-by-seg3.csv")