-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetzeroweightwugs.R
56 lines (43 loc) · 1.69 KB
/
getzeroweightwugs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Read the tab-separated results table (with header) and keep only the
# `word` and `score` columns.
UCLAPL.output <- "~/Work/UCLAPL/finnish/output-wugs/blickTestResults.txt"
all.wugs <- read.table(UCLAPL.output, sep = "\t", header = TRUE)
# Skip the first two data rows. Negative indexing is used instead of
# 3:nrow(all.wugs) because that sequence runs backwards (3, 2, ...) when the
# table has fewer than three rows.
all.wugs <- all.wugs[-(1:2), c("word", "score")]
# Convert via character so that a factor column yields its labels' numeric
# values rather than its internal integer codes.
all.wugs$score <- as.numeric(as.character(all.wugs$score))
summary(all.wugs)
# which() discards comparisons that evaluate to NA, so scores that could not
# be parsed do not turn into all-NA rows in `zeroes`.
zeroes <- all.wugs[which(all.wugs$score == 0), ]
n <- nrow(all.wugs)
nzero <- nrow(zeroes)
print(paste0("Total wugs = ", n, ", total 0's = ", nzero, "; ", nzero/n*100, "%"))
## Let's annotate here, not in hanalyze1 :((
# Label every zero-score wug with the pattern its `word` matches; rows that
# match nothing stay "none". The assignments run in order, so a word matching
# several regexes ends up with the label of the last one that matched.
# rep() keeps the initialisation valid for a data frame with zero rows.
zeroes$pattern <- factor(rep("none", nrow(zeroes)),
                         levels = c("none", "1", "2", "3", "4", "5"))
# Assign through the column (zeroes$pattern[mask]) rather than through a row
# subset (zeroes[mask, ]$pattern): the latter stops with "replacement has 1
# row, data has 0" whenever a regex matches no row at all.
zeroes$pattern[grepl("[aeiou] j [aeiou]", zeroes$word)] <- "1"
zeroes$pattern[grepl("[aeiou] [aeiou]", zeroes$word)] <- "2" ## ouchie
zeroes$pattern[grepl(" i[ei]? pp? [aeiou]", zeroes$word)] <- "3"
zeroes$pattern[grepl("[aeiou] ll? [aeiou]", zeroes$word)] <- "4"
zeroes$pattern[grepl("[aeiou] [fsh] j [aeiou]", zeroes$word)] <- "5"
# Report how many wugs carry each label.
for (i in levels(zeroes$pattern)) {
  print(paste0("Pattern ", i, ": ", sum(zeroes$pattern == i)))
}
## filter for existence
## let's sample some wugs.
# TOTAL will be 160
# 20 by pattern, 60 none?
#' Draw a random sample of rows stratified by `pattern`.
#'
#' Draws `n.none` rows whose pattern is "none" first, then `n.byPattern` rows
#' for each remaining level of `data$pattern` in level order, always without
#' replacement. If a group holds fewer rows than requested, all of its rows
#' are returned (in random order) and a warning reports the shortfall,
#' instead of `sample()` aborting the whole run.
#'
#' @param data Data frame with a factor column `pattern`.
#' @param n.byPattern Number of rows to draw for each level other than "none".
#' @param n.none Number of rows to draw from the "none" level.
#' @return Data frame of the sampled rows: the "none" rows first, followed by
#'   the rows of the other levels in level order.
create.sample <- function(data, n.byPattern = 20, n.none = 60) {
  # Sample up to `size` rows of `data` whose pattern equals `level`.
  draw <- function(level, size) {
    # which() ignores NA comparisons and needs no non-standard evaluation.
    rows <- which(data$pattern == level)
    if (length(rows) < size) {
      warning("Only ", length(rows), " rows with pattern '", level,
              "'; requested ", size, ".", call. = FALSE)
      size <- length(rows)
    }
    data[rows[sample.int(length(rows), size)], , drop = FALSE]
  }

  other.levels <- setdiff(levels(data$pattern), "none")
  # "none" is drawn before the other levels so the sequence of random draws
  # matches the order of the result.
  pieces <- c(
    list(draw("none", n.none)),
    lapply(other.levels, draw, size = n.byPattern)
  )
  # Bind once at the end rather than growing the result inside a loop.
  do.call(rbind, pieces)
}
#' Strip every space character from the `word` column.
#'
#' @param x Data frame with a `word` column.
#' @return A copy of `x` in which each `word` has all " " characters removed;
#'   the remaining columns are returned unchanged.
remove.spaces <- function(x) {
  despaced <- gsub(" ", "", x$word)
  x$word <- despaced
  x
}
# Draw the stratified sample of zero-score wugs, then remove the spaces from
# the sampled words.
new.sample <- create.sample(zeroes)
unspaced <- remove.spaces(new.sample)
# Export the space-free version. `unspaced` used to be computed and then
# ignored (the file was written from `new.sample`), so the space-stripping
# step had no effect on the output.
write.table(unspaced, "wugs-to-test.csv", quote = FALSE, sep = ",", row.names = FALSE)