-
Notifications
You must be signed in to change notification settings - Fork 0
/
match-municips-and-regions.R
61 lines (44 loc) · 1.59 KB
/
match-municips-and-regions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
library(dplyr)
# ---- Load the two input tables ----
# A must provide the MUN and YoB columns (both are used below).
# B is renamed to REG / MUN regardless of the header found in the file.
A <- read.csv(file = "mun1.csv", header = TRUE)
B <- read.csv(file = "regioni-opstini.csv", header = TRUE)
names(B) <- c("REG", "MUN")

# Upper-case the municipality names on both sides so that the comparison and
# the join below do not depend on letter case.
B$MUN <- toupper(B$MUN)
A$MUN <- toupper(A$MUN)

# Quick look at both tables (auto-printed when run at top level).
nrow(A)
nrow(B)
head(A)
head(B)

# ---- Municipality names present in A but missing from B ----
unA <- sort(unique(A$MUN))
unB <- sort(unique(B$MUN))
Match <- is.element(unA, unB)
unA[!Match]

# ---- Attach the region to every row of A ----
# inner_join() keeps only the rows of A whose MUN also occurs in B.
C <- A %>% inner_join(B, by = "MUN")

# AGE is 2016 minus YoB (presumably the year of birth -- confirm against data).
C$AGE <- 2016 - C$YoB

tail(C)
glimpse(C)
nrow(C)

# Persist the joined table.
write.csv(x = C, file = "Voters.csv")
## Find out which municipalities have the biggest % of young (aged 18-22),
## potentially problematic entries.

# Number of rows with AGE in 18:22 for every (REG, MUN) pair. Pairs without
# any such row are absent from the result. The count is stored in a column
# named AGE, as before.
teens <- C %>%
  filter(AGE %in% 18:22) %>%
  group_by(REG, MUN) %>%
  summarise(AGE = n()) %>%
  ungroup()

# Total number of rows for every (REG, MUN) pair, again stored in AGE.
all <- C %>%
  group_by(REG, MUN) %>%
  summarise(AGE = n()) %>%
  ungroup()

# Keep only the totals of the pairs that also occur in `teens`.
# The comparison uses the full (REG, MUN) key: matching on MUN alone would
# also keep a same-named municipality from another region, which breaks the
# row-by-row alignment with `teens` below. "\r" is used as a separator that
# is not expected inside either name.
Match1 <- paste(all$REG, all$MUN, sep = "\r") %in%
  paste(teens$REG, teens$MUN, sep = "\r")
all2 <- all[Match1, ] # subset the rows based on TRUE/FALSE in Match1

# Both tables come out of the same grouped summary, so their rows should be in
# the same key order; fail loudly instead of computing misaligned percentages.
stopifnot(
  identical(as.character(all2$REG), as.character(teens$REG)),
  identical(as.character(all2$MUN), as.character(teens$MUN))
)

## Percent of each pair's rows that fall in the 18-22 range.
teens$Percent <- 100 * teens$AGE / all2$AGE
teens <- teens %>% arrange(desc(Percent))
#' Share of rows in an age range, per region and municipality
#'
#' For every (REG, MUN) pair in `DataFrame`, counts the rows whose `AGE` is in
#' `AgeRange` and the total number of rows, and reports the former as a
#' percentage of the latter. Both counts come from a single grouped summary,
#' so they always refer to the same (REG, MUN) pair, even when a municipality
#' name occurs under more than one region.
#'
#' dplyr must be installed; it is called through `dplyr::` and is not attached
#' by this function.
#'
#' @param DataFrame Data frame with `REG`, `MUN` and `AGE` columns. Defaults
#'   to the global object `C`.
#' @param AgeRange Vector of ages to count; rows are matched with `%in%`.
#' @return An ungrouped tibble with one row per (REG, MUN) pair that has at
#'   least one row in `AgeRange`, sorted by decreasing `Percent`. Columns:
#'   `REG`, `MUN`, `AGE` (number of rows in range), `Percent`, and `Total`
#'   (number of rows for the pair).
getPercentOfAge <- function(DataFrame = C, AgeRange = c(18:22)) {
  # Total is computed before AGE is overwritten, so length(AGE) still sees the
  # original per-row AGE column of the group.
  counts <- dplyr::summarise(
    dplyr::group_by(DataFrame, REG, MUN),
    Total = length(AGE),
    AGE = sum(AGE %in% AgeRange)
  )
  counts <- dplyr::ungroup(counts)

  # Pairs without any row in the requested range are not reported.
  in_range <- dplyr::filter(counts, AGE > 0)
  in_range <- dplyr::mutate(in_range, Percent = 100 * AGE / Total)

  # Select the output columns by name so the result does not depend on how
  # many other columns DataFrame happens to carry.
  in_range <- dplyr::select(in_range, REG, MUN, AGE, Percent, Total)
  dplyr::arrange(in_range, dplyr::desc(Percent))
}
# Run the wrapper on the joined table C for ages 18 through 103; the result is
# not assigned to a variable.
getPercentOfAge(DataFrame = C, AgeRange = 18:103)