-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_all_templates.R
172 lines (130 loc) · 5.28 KB
/
create_all_templates.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
##
## Tidyverse packages
##
library(dplyr)
library(purrr)
library(readr)
##
## Other packages
##
library(XLConnect)
library(openxlsx)
basewd = getwd()
ONSPD_path = list.files(file.path(basewd, "data"), full.names = TRUE, pattern = "ONSPD")
documents_path <- file.path(ONSPD_path ,"Documents")
#There should only be one file/folder with ONSPD in the name however if not a warning will be sent
if(length(ONSPD_path)>1) print("Warning more than one ONSPD object defaulting to first in list")
ONSpostcodes_csv <- list.files( file.path(ONSPD_path , "Data"), pattern ="\\.csv", full.names = T)
wards_file <- "lsoa_to_ward.csv"
#The output folder with all the region templates in
Regions <- file.path(basewd, "data" ,"Regions")
select <- dplyr::select
print('Loading data')
#This is a convenience function for loading the lookup tables used by
#the ONSPD
return_lookup_data <- function(folder_path, file_regex_pattern ){
#get the path to the ward names lookup
lookup_path <-list.files(folder_path,
pattern = file_regex_pattern,
ignore.case = TRUE,
full.names = TRUE)
lookup_df <- read_csv(lookup_path)
#Removes the years digits from the column names.
#The lookup tables have the year that the table codes were generated the name
#i.e. on the wardnames lookup from after the 2020 update
#the column names will be "WD20CD" and "WD20NM",
#This Regex removes the numbers meaning that changes in years will not affect the process
#This makes the process more robust.
#However, if the ONS change the naming convention could cause an error
names(lookup_df) = gsub("\\d","",names(lookup_df))
return(lookup_df)
}
wardnames_lookup <- return_lookup_data(documents_path, "^ward names.*(csv)" )
region_lookup <- return_lookup_data(documents_path, "^region.*(csv)" ) %>%
select(1,3)
LADNM_lookup <- return_lookup_data(documents_path, "^la_ua.*(csv)" ) %>%
select(1:2)
CorePstCd <- read_csv(ONSpostcodes_csv) %>%
select(Postcode = pcd, lsoa11cd = lsoa11,
MSOA11CD = msoa11, oslaua,
OA11CD = oa11, Country_code = ctry,
RGNCD = rgn,
WDCD = osward)
{
PstCdLSOA <- CorePstCd %>%
left_join(., region_lookup, by = "RGNCD") %>%
left_join(wardnames_lookup, by = "WDCD") %>%
left_join(LADNM_lookup, by = c("oslaua"= "LADCD")) %>%
#filter(Country_code == "W92000004")
select(Postcode, Country_code, Admin_district_code = oslaua,
Admin_ward_code = WDCD, lsoa11cd, LADNM,
RGNCD, RGNNM, WD16NM = WDNM) %>%
mutate(
#Make sure Scotland and Wales have region names
RGNCD = case_when(
grepl("W", Country_code) ~"Wales",
grepl("S", Country_code) ~"Scotland",
TRUE ~ RGNCD),
RGNNM = case_when(
grepl("W", Country_code) ~"Wales",
grepl("S", Country_code) ~"Scotland",
TRUE ~ RGNNM
),
Postcode = gsub(" ", "", Postcode)) %>% #remove spaces from postcodes
filter(!is.na(RGNNM),
LADNM != "NA",
RGNCD !="Scotland")
#This is a large dataframe so we want to remove it when possible to save memory
rm(CorePstCd)
###
##REGION CLEANING
##A small number of postcodes can slip into different, regionss, this
###is likely an error from postcodes being updated bny the ONS or post office.
##The code below fixes this issue by using a majority vote
###
#this was used previously but started causing errors when only
#ONSPD data started being used
# MajRegion <- PstCdLSOA %>% group_by(lsoa11cd, RGNCD) %>%
# summarise(count = n(),
# RGNNM = first(RGNNM)) %>%
# group_by(lsoa11cd) %>%
# arrange(-count) %>%
# summarise_all(first) %>%
# select(-count)
MajLAD <- PstCdLSOA %>% group_by(lsoa11cd, Admin_district_code) %>%
summarise(count = n()) %>%
group_by(lsoa11cd) %>%
arrange(-count) %>%
summarise_all(first) %>%
select(-count)
PstCdLSOA <- PstCdLSOA %>%
select(-Admin_district_code) %>%
left_join(MajLAD)
}
unique(PstCdLSOA$RGNNM) %>%
walk(~{
title <- paste("Region", .x)
dir.create(file.path(Regions, title ), recursive = TRUE, showWarnings = FALSE)
#The inner loop creates the excel files for each LAD within each region
PstCdLSOA.Reg <- PstCdLSOA %>% filter(RGNNM == .x)
print(title)
unique(PstCdLSOA.Reg$LADNM) %>%
walk(~{
PstCdLSOA.filt <- PstCdLSOA.Reg %>% filter(LADNM == .x)
print(paste(title, ":", .x))
ExemptionsFile <- paste0(.x,"ExemptionsLSOA.xlsx")
DiscountsFile <- paste0(.x,"DiscountsLSOA.xlsx")
setwd(basewd)
wb <- loadWorkbook("Convert Postcode to LSOA.xlsx")
#input the LSOA lookup table
writeData(wb, sheet = "LSOAdata", PstCdLSOA.filt, colNames = T)
#add in randomly sampled postcodes as an example
writeData(wb, sheet = "Input", sample(PstCdLSOA.filt$Postcode, 6),
startCol = 2, startRow = 2,
colNames = F)
setwd(file.path(Regions, title))
saveWorkbook(wb,ExemptionsFile ,overwrite = T)
writeData(wb, "Input", "Discount_Type") #So that it says Discounts not exemptions, so nobody gets confused.
saveWorkbook(wb,DiscountsFile ,overwrite = T)
})
})