-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSample Cleaning Script from Lecture 4_17_18.R
152 lines (97 loc) · 3.08 KB
/
Sample Cleaning Script from Lecture 4_17_18.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
install.packages("openxlsx")
library(openxlsx)
##1
#Import Data
data<-read.xlsx("https://raw.githubusercontent.com/Cghlewis/Sample-files/master/XLSX two row.xlsx
",sheet=1, startRow =2, colNames=TRUE)
str(data)
data2<-read.xlsx("https://raw.githubusercontent.com/Cghlewis/Sample-files/master/XLSX two row.xlsx
",sheet=2, startRow =1, colNames=TRUE)
str(data2)
##2
#Merge in ID
Merged<-merge(data,data2,by=c("TeacherName"), all=TRUE)
#Delete name
Merged$TeacherName<-NULL
##3
#Check for duplicates
install.packages("dplyr")
library(dplyr)
which(duplicated(Merged$TeacherID))
#Remove the duplicate
Merged<-Merged[-c(19),]
##4
#Drop unnecessary variables
Merged$V10<-NULL
Merged$Q1<-NULL
Merged$Q3<-NULL
##5
#Rename variables and add labels if necessary
#Rename variables
install.packages("plyr")
library(plyr)
Merged<-rename(Merged, c("Q4"="SchoolID", "Q5"="Age", "Q6"="Gender", "Q7"="Race", "Q7_TEXT"="Race_Text",
"Q8"="Grade"))
head(Merged)
#Add variable labels
install.packages("Hmisc")
library(Hmisc)
label(Merged$Age) <- "Age at time of survey"
describe(Merged)
##6
#Add value labels for nominal/factor variables
Merged$SchoolID<-factor(Merged$SchoolID,levels=c(1,2,3,4),labels=c("SchoolA","SchoolB","SchoolC","SchoolD"))
describe(Merged)
##7
#Recode any variables if necessary
Merged$Q18_1new<-((max(Merged$Q18_1)+1)-Merged$Q18_1)
#Check to see if recode worked
table(Merged$Q18_1, Merged$Q18_1new)
#Recode Gender
install.packages("car")
library(car)
Merged$Female<-recode(Merged$Gender,"1=0;2=1")
#Check to see if recode worked
table(Merged$Female, Merged$Gender)
#Make our date an actual date
Merged$StartDate2<-convertToDate(Merged$StartDate)
Merged$EndDate2<-convertToDate(Merged$EndDate)
#Check to see if it worked
str(Merged)
#Drop all original variables
Merged$StartDate<-NULL
Merged$EndDate<-NULL
Merged$Gender<-NULL
Merged$Q18_1<-NULL
##8
#Create measure scores (na.rm=TRUE)
Merged$Stresssum <- Merged$Q18_1new + Merged$Q18_2 + Merged$Q18_3 + Merged$Q18_4 +
Merged$Q18_5
##9
#We don't actually have any missing data
#If we wanted to make missing -999 then do
Merged$Q18_1new[Merged$Q18_1new=='NA']<--999
##Number of cases
nrow(Merged)
#Number of cases by number of variables
dim(Merged)
##Check descriptives
mean(Merged$Stresssum)
sd(Merged$Stresssum)
range(Merged$Stresssum)
install.packages("psych")
library(psych)
describe(Merged)
##Check for missing data (Tweak this to skip over string variables)
Merged[!complete.cases(Merged),]
library(Hmisc)
describe(Merged)
##Export
#write adds in an extra column b/c r gives row names so say row names false
#missing values have N/A
write.csv(Merged, file="C:/Users/hamptoncg/Desktop/R/Merged2.csv", row.names=F)
#Or can use a package
install.packages("rio")
library(rio)
#missing values are blank
export(Merged, file="C:/Users/hamptoncg/Desktop/R/Merged3.csv")