-
Notifications
You must be signed in to change notification settings - Fork 0
/
DR1clean.R
176 lines (137 loc) · 5.23 KB
/
DR1clean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
setwd("~/git/M9/")
# clean leonardDatMod
cleanDR1 = function() {
dat = read.csv("leonardDatMod_DR1.csv", header=TRUE)
# Notes on data just loaded:
# plusMinus is ±
# lots of ?'s. Maybe just remove those data points for now
# bp: number of calendar years before 1950
# rcybp: radiocarbon years before 1950
# make everything strings instead of factor variables
for(c in 1:ncol(dat)) {
dat[,c] = as.character(dat[,c])
}
# fix index 801 before removing data or 375 after, which has unnecessary semicolons at end
dat[801,7] = "2465-2972"
dat[801,8] = "2650plusMinus90"
# remove 802nd row
dat = dat[-802,]
# set soilNo for empty rows to 1 (107-111)
dat[107:111,6] = "1"
# change the weird "1?" soilNo values to 1
dat[dat[,6] == "1?",6] = "1"
attach(dat)
# remove rows where event is unknown
n = nrow(dat)
##### convert rcybp to 2 columns, a high and a low
# split strings around "pm"
rcybpRoot=strsplit(as.character(rcybp), "plusMinus")
bpRoot=strsplit(as.character(bp), "-")
# get left part of strings (central estimate) and right part (margin of error)
rcybpCntr = sapply(rcybpRoot, "[", 1)
rcybpMOE = sapply(rcybpRoot, "[", 2)
bpL = as.numeric(sapply(bpRoot, "[", 1))
bpR = as.numeric(sapply(bpRoot, "[", 2))
bpCntr = apply(cbind(bpL, bpR), 1, mean)
bpMOE = (bpR - bpL)*0.5
# convert from string to number (don't worry about warnings)
rcybpCntr = as.numeric(rcybpCntr)
rcybpMOE = as.numeric(rcybpMOE)
# these datapoints have max and min instead of center and MOE estimated for some reason
#NOTE: assume max minus central estimate is like a 95% MOE and the central estimate is a good estimate
# maxMinI = c(240, 242) # these are the indices after removing data
maxMinI = c(340, 342) # these are the indices before removing data
rcybpCntr[maxMinI] = c(mean(c(763, 855)), mean(c(1530, 1700)))
rcybpMOE[maxMinI] = c(855-mean(c(763, 855)), 1700-mean(c(1530, 1700)))
##### get ready to fill in blanks of table using this function
# function for filling in blanks in table
fillBlanks = function(varTable, varInds, changeInds) {
# loop to fill in blanks in dataset
for(i in 1:length(varInds)) {
ind = varInds[i]
##### get the set of indices with the same dates as the data at the given index
# first data point
if(i == 1) {
inds = 1:changeInds[1]
}
# last data point
else if(ind > max(changeInds)) {
inds = (changeInds[length(changeInds)]+1):n
}
# general case
else {
indR = which(changeInds >= ind)[1]
indL = indR-1
inds = (changeInds[indL]+1):changeInds[indR]
}
##### fill in the blanks
tmp = t(varTable[inds,])
tmp[,] = varTable[ind,]
varTable[inds,] = t(tmp)
}
return(varTable)
}
##### convert certain numbers to numbers if R assumes they are factor variable
soilNo = as.numeric(as.character(soilNo))
Lat = as.numeric(as.character(Lat))
Uncertainty = as.numeric(as.character(Uncertainty))
##### fill in blanks for lon and lat data using soilNo variable
#for testing purposes
orig = cbind(Lon, Lat)
#compute data indices and when groups change
newLoc = soilNo == 1
locInds = (1:n)[newLoc]
changeInds = locInds[2:length(locInds)]-1
# fill in the blanks
toFillIn = cbind(Lon, Lat)
filledIn = fillBlanks(toFillIn, locInds, changeInds)
Lon = filledIn[,1]
Lat = filledIn[,2]
# check results:
ids = cbind(Region, Site, soilNo, event)
test = cbind(Lon, Lat)
print(cbind(ids[1:30,], test[1:30,], orig[1:30,]))
print(cbind(ids[(n-20):n,], test[(n-20):n,], orig[(n-20):n,]))
##### make new data table
newDat = data.frame(cbind(Region, Site, Lat, Lon, coreNo, soilNo, bpCntr, bpMOE,
rcybpCntr, rcybpMOE, event, subsidence, Uncertainty,
source, quality, method))
##### remove bad data
# remove the "?" event data points (NOTE may change this in the future)
isQu = newDat$event == "?"
newDat = newDat[!isQu,]
# remove the ? subsidence data
isQu = is.na(as.numeric(as.character(newDat$subsidence)))
newDat = newDat[!isQu,]
# remove the NA Uncertainty/subsidence/event data
cols = c(11:13)
isNA = is.na(newDat[,cols])
isNA = apply(isNA, 1, any)
newDat = newDat[!isNA,]
# test newDat:
print(dat[1:30,])
print(newDat[1:30,])
print(dat[(nrow(dat)-30):nrow(dat),])
print(newDat[(nrow(newDat)-30):nrow(newDat),])
##### write file
detach(dat)
rownames(newDat)=NULL
#write as excel file
library(xlsx)
write.xlsx(newDat, file="DR1.xlsx", row.names = FALSE, showNA = FALSE)
#make sure all variable types are right
numericCols = c(3:10, 12:13)
factorCols = c(1:2, 11, 14, 15)
for(c in numericCols) { newDat[,c] = as.numeric(as.character(newDat[,c])) }
for(c in factorCols) { newDat[,c] = as.factor(as.character(newDat[,c])) }
# Lon values are negative of what they should be
newDat[,4] = -newDat[,4]
# Don't remove this event! It's a full or nearly full rupture that is agreed upon existing
# #remove T10R1 event
# newDat = newDat[newDat$event != "T10R1",]
# save as rdata file
dr1 = newDat
save(dr1, file="DR1.RData")
invisible(NULL)
}
cleanDR1()