forked from fivethirtyeight/data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmost-common-name.R
217 lines (157 loc) · 10.5 KB
/
most-common-name.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#### Story: "Dear Mona, What’s The Most Common Name In America?"
#### Url: http://fivethirtyeight.com/features/whats-the-most-common-name-in-america/
#### Authors: Mona Chalabi (Mona.Chalabi@fivethirtyeight.com) and Andrew Flowers (andrew.flowers@fivethirtyeight.com)
# library() stops with an error when a package is missing; require() merely
# returns FALSE and would let the script run on into confusing downstream errors.
library(babynames)
library(dplyr)
library(reshape2)
library(zoo)
library(datasets)
# Census population parameters
pop2000 <- 276059000 # year 2000 population
pop2013 <- 316128839 # year 2013 population
hispPopShare <- .171 # Hispanic share of overall population
foreignPopShare <- .127 # Foreign-born share of overall population
# Census growth rates by racial categories, 2000-2013 (2013 pop / 2000 pop)
whiteGrowth <- 1.01155164
blackGrowth <- 1.138799977
aianGrowth <- 1.110695106 # American Indian / Alaska Native
asianGrowth <- 1.553975166
twoRaceGrowth <- 1.817182595
hispGrowth <- 1.531490233
### Step 1: Actuarial Adjustments
# Build a "percent still alive in 2013" curve for every birth year 1900-2013
# from SSA decade-level life tables, then apply it to the national babynames
# counts to estimate how many living people hold each first name.
aging <- read.csv("aging-curve.csv") # taken from SSA: http://www.ssa.gov/oact/NOTES/as120/LifeTables_Tbl_7.html; 2013 is used as base year
agingSmooth <- data.frame(year=c(1900:2013))
# The SSA table only has rows at decade marks; match() leaves NA in between
agingSmooth$perctAliveMale <- aging[match(agingSmooth$year, aging$Decade),]$Male.1
agingSmooth$perctAliveFemale <- aging[match(agingSmooth$year, aging$Decade),]$Female.1
# Linearly interpolate the decade gaps; rows 1:111 are birth years 1900-2010
# (na.approx drops the trailing NAs, hence the explicit index range)
agingSmooth$perctAliveMale[1:111] <- na.approx(agingSmooth$perctAliveMale)
agingSmooth$perctAliveFemale[1:111] <- na.approx(agingSmooth$perctAliveFemale)
# Rows 112:114 are birth years 2011-2013, past the last SSA decade mark
agingSmooth$perctAliveMale[112:114] <- 0.993480 # Assume probabilities are flat since 2010
agingSmooth$perctAliveFemale[112:114] <- 0.994490 # Assume probabilities are flat since 2010
# Estimate stock of first names of those currently living, using actuarial probabilities
babyNames <- babynames %>% filter(year>=1900)
# Sex-specific survival probability looked up by birth year
babyNames$perctAlive <- ifelse(babyNames$sex=="F", agingSmooth[match(babyNames$year, agingSmooth$year),]$perctAliveFemale, agingSmooth[match(babyNames$year, agingSmooth$year),]$perctAliveMale)
babyNames$nAlive <- babyNames$n * babyNames$perctAlive
# Collapse across years and sexes to one estimated living total per name
stockNames <- babyNames %>% group_by(name) %>%
dplyr::summarize(totalAlive=sum(nAlive)) %>% ungroup() %>%
arrange(desc(totalAlive))
# NOTE(review): grep() matches substrings, so this also hits e.g. "Mariah";
# [1] takes the highest-ranked such match -- confirm it is exactly "Maria"
grep("Maria", stockNames$name)[1] # "Maria" is the 97th most common first name (unadjusted)
stockNames$perct2013 <- stockNames$totalAlive/pop2013
#### Step 2: Adjust first names to correct for undercounting of Hispanic names
# Why? Need to scale up Hispanic/Latino names because immigration not factored into SSA data set
# We can make inferences about Hispanic name undercounting by looking at names by state, which the SSA does provide.
######### ONLY RUN THIS ONCE, DATA SAVED IN all-states-babynames.csv #########
# Download state files from here: http://www.ssa.gov/oact/babynames/state/namesbystate.zip
# stateFiles <- grep("\\.TXT", list.files())
# allStates <- data.frame()
# for (i in stateFiles){
#   stateData <- read.table(list.files()[i], sep=",")
#   allStates <- rbind(allStates, stateData)
# }
# names(allStates) <- c("state", "sex", "year", "name", "n")
# write.csv(allStates, "all-states-babynames.csv")
######### ONLY RUN THIS ONCE, DATA SAVED IN all-states-babynames.csv #########
allStates <- read.csv("all-states-babynames.csv", stringsAsFactors=FALSE) # Huge file, created from code above
# Survival probability by birth year and sex, as in Step 1.
# BUG FIX: the male branch previously looked up agingSmooth rows with
# babyNames$year (a different, differently-sized table -- a copy-paste from
# the national computation above), so male survival rates were misaligned
# across rows. Both branches must be indexed by allStates$year.
allStates$perctAlive <- ifelse(allStates$sex=="F", agingSmooth[match(allStates$year, agingSmooth$year),]$perctAliveFemale, agingSmooth[match(allStates$year, agingSmooth$year),]$perctAliveMale)
allStates$nAlive <- allStates$n * allStates$perctAlive
# Estimated living holders of each name within each state
stateNames <- allStates %>% group_by(name, state) %>%
dplyr::summarize(totalAlive=sum(nAlive)) %>% ungroup() %>%
arrange(desc(totalAlive))
# Import state's hispanic population
statePop <- read.csv("state-pop.csv", stringsAsFactors=F)
# Population columns arrive as strings with thousands separators ("1,234,567")
statePop$totalPop <- gsub( ",", "", statePop$totalPop)
statePop$hispPop <- gsub( ",", "", statePop$hispPop)
statePop$hispPerct <- as.numeric(statePop$hispPop)/as.numeric(statePop$totalPop)
# Convert full state names to postal abbreviations to match the SSA state files
statePop$state <- state.abb[match(statePop$state, state.name)]
# D.C. is not in datasets::state.name, so the match above left an NA there.
# NOTE(review): hard-coded row index -- breaks if state-pop.csv is reordered;
# presumably row 9 is D.C. in the shipped file -- verify against the CSV
statePop$state[9] <- "DC" # Correct D.C. being coded as "NA"
# Add Hispanic population percentage and state population to each name/state combo
stateNames$hispPerct <- statePop[match(stateNames$state, statePop$state),]$hispPerct
stateNames$statePop <- statePop[match(stateNames$state, statePop$state),]$totalPop
# Calculate frequency of that name in that state, nation-wide, and difference between the two
stateNames$stateFreq <- as.numeric(stateNames$totalAlive) / as.numeric(stateNames$statePop)
stateNames$natFreq <- stockNames[match(stateNames$name, stockNames$name),]$perct2013
stateNames$diffFreq <- stateNames$stateFreq - stateNames$natFreq
# Group name/state combos into one national total, and take top 1000 names
natNamesByState <- stateNames %>% group_by(name) %>% summarize(n=sum(totalAlive)) %>% arrange(desc(n))
topNames <- natNamesByState[1:1000,]$name
# Regress the frequency of each top-1000 first name in a state on that
# state's Hispanic population share. A positive hispPerct slope means the
# name is relatively more common in heavily Hispanic states.
# Formula: stateFrequency ~ intercept + hispPerct, weighted by the square-root of the state's population
coefs <- data.frame(intercept=rep(NA_real_, length(topNames)), hispPerct=rep(NA_real_, length(topNames)))
for(i in seq_along(topNames)){ # seq_along(), not 1:length(): safe if topNames were empty
  # Exact equality instead of the original paste0("^", name, "$") regex:
  # equivalent for plain names, and immune to regex metacharacters
  m <- lm(data=stateNames[stateNames$name == topNames[i],], formula=stateFreq~hispPerct, weights=sqrt(as.numeric(statePop)))
  coefs[i,] <- coef(m) # preallocated; the old rbind-in-a-loop grew the frame O(n^2)
}
names(coefs) <- c("intercept", "hispPerct")
coefs$name <- topNames
# correction = ratio of the name's predicted national frequency at the true
# Hispanic population share vs. at the foreign-born share the SSA data reflects
coefs$correction <- (coefs$intercept + coefs$hispPerct*hispPopShare)/(coefs$intercept + coefs$hispPerct*foreignPopShare)
hispNameCorrex <- coefs %>% arrange(desc(correction))
# Cap Hispanic name correction on top and bottom
hispNameCorrex$correction <- ifelse(hispNameCorrex$correction > (hispPopShare/foreignPopShare), (hispPopShare/foreignPopShare), hispNameCorrex$correction)
hispNameCorrex$correction <- ifelse(hispNameCorrex$correction < (1-hispPopShare)/(1-foreignPopShare), (1-hispPopShare)/(1-foreignPopShare), hispNameCorrex$correction)
# Names outside the top 1000 get no correction (factor of 1)
stockNames$correction <- hispNameCorrex[match(stockNames$name, hispNameCorrex$name),]$correction
stockNames$correction <- ifelse(is.na(stockNames$correction), 1.0, stockNames$correction)
# BUG FIX: was stockNames$total, which only worked via data.frame $ partial
# matching; dplyr returns tibbles, where $total is NULL (no partial match),
# silently producing an empty/NA column. Use the full column name.
stockNames$newTotal <- stockNames$totalAlive * stockNames$correction
stockNames$newPerct2013 <- stockNames$newTotal / pop2013
stockNames2 <- stockNames %>% arrange(desc(newPerct2013))
#### Step 3: Surnames analysis
# Project each surname's 2000 Census count forward to 2013 by scaling every
# racial component with its own growth rate (parameters at top of script).
surnames <- read.csv("surnames.csv", stringsAsFactors=FALSE)
baseCount <- as.numeric(surnames$count) # 2000 Census count, hoisted once
surnames$hispCount2013 <- baseCount * (as.numeric(surnames$pcthispanic)/100) * hispGrowth
surnames$whiteCount2013 <- baseCount * (as.numeric(surnames$pctwhite)/100) * whiteGrowth
surnames$blackCount2013 <- baseCount * (as.numeric(surnames$pctblack)/100) * blackGrowth
surnames$asianCount2013 <- baseCount * (as.numeric(surnames$pctapi)/100) * asianGrowth
surnames$aianCount2013 <- baseCount * (as.numeric(surnames$pctaian)/100) * aianGrowth
surnames$twoRaceCount2013 <- baseCount * (as.numeric(surnames$pct2prace)/100) * twoRaceGrowth
# Recombine the projected racial components into a 2013 total and share
surnames$Count2013 <- with(surnames, hispCount2013 + whiteCount2013 + blackCount2013 + asianCount2013 + aianCount2013 + twoRaceCount2013)
surnames$perct2013 <- surnames$Count2013/pop2013
surnames <- surnames %>% arrange(desc(perct2013))
# Keep the 100 most common first names and surnames
topFirstName <- stockNames2[1:100,] %>% select(name, newPerct2013)
topSurnames <- surnames[1:100,] %>% select(name, perct2013)
# 100x100 grid: rows = top first names, columns = top surnames
nameMatches <- data.frame(matrix(NA, nrow = 100, ncol = 100))
row.names(nameMatches) <- topFirstName$name
names(nameMatches) <- topSurnames$name
# Under independence, P(first & last) = P(first) * P(last); fill column-wise
for (j in seq_len(100)){
  nameMatches[,j] <- topFirstName$newPerct2013 * topSurnames[j,]$perct2013
}
nameMatches2 <- nameMatches * pop2013 # Scale matrix by 2013 population
# Melt independent name matches into long format: one row per
# (first name, surname) pair with its estimated independent count
longNames <- melt(nameMatches2, measure.vars=1:100)
# melt() stacks the 100 surname columns in order and keeps row order within
# each column, so recycling the first-name vector 100 times relabels every
# row correctly (relies on that ordering contract of reshape2::melt)
longNames$firstName <- rep(topFirstName$name, 100)
longNames <- longNames %>% select(firstName, variable, value) %>% arrange(desc(value))
names(longNames) <- c("FirstName", "Surname", "EstimatedNumber")
# Read in Hartman's adjustments table (percent deviation from independence
# for first-name/surname pairs), and melt it
# Source for the data in the 'adjustments.csv' is here: http://mypage.siu.edu/lhartman/johnsmith.html
adjustments <- read.csv("adjustments.csv")
adjustments2 <- melt(adjustments, measure.vars=2:21)
names(adjustments2) <- c("FirstName", "Surname", "Adjustment")
# Clean full names, attach estimated number (by pure independent odds);
# lower-cased "first last" strings serve as the join key between the tables
longNames$cleanName <- paste(tolower(longNames$FirstName), tolower(longNames$Surname), sep=" ")
adjustments2$cleanName <- paste(tolower(adjustments2$FirstName), tolower(adjustments2$Surname), sep=" ")
adjustments2$Estimate <- longNames[match(adjustments2$cleanName, longNames$cleanName),]$EstimatedNumber
adjustedTotals <- adjustments2 %>% arrange(desc(Estimate))
# Apply Hartman's percentage adjustment to the independence-based estimate
adjustedTotals$finalEstimate <- adjustedTotals$Estimate * (1+ (adjustedTotals$Adjustment/100))
adjustedTotals <- adjustedTotals %>% arrange(desc(finalEstimate))
# NOTE(review): adjustedMatrix is never referenced again below -- presumably
# kept for interactive inspection; confirm before removing
adjustedMatrix <- dcast(adjustedTotals, formula=FirstName~Surname)
# Pairs outside Hartman's table get NA finalEstimate from this match()
longNames$finalEstimate <- adjustedTotals[match(longNames$cleanName, adjustedTotals$cleanName),]$finalEstimate
finalMatrix <- dcast(longNames, formula=FirstName~Surname)
# Keep the top-20 first names (row order of the independence matrix) and the
# first 20 surname columns (plus the FirstName column) for the final table
tableNames <- row.names(nameMatches)[1:20]
finalMatrix2 <- finalMatrix[match(tableNames, finalMatrix$FirstName),]
advMatrixFinal <- finalMatrix2[,1:21]
#### Numbers to check in piece
# Sanity-check figures quoted in the article text
sum(stockNames2$newPerct2013) # first names cover 80% of population
sum(stockNames2$newTotal) # first names cover 255 million people
# Positions of two unadjusted full names in the ranked long table
which(grepl("maria garcia", adjustments2$cleanName)) # Maria Garcia (unadjusted) is the 354th most common name
which(grepl("maria smith", adjustments2$cleanName)) # Maria Smith (unadjusted) is the 74th most common name
# Independence-based headcount for a rare combination
kieronShare <- stockNames2$newPerct2013[which(grepl("Kieron", stockNames2$name))[1]]
georgeShare <- surnames$perct2013[which(grepl("GEORGE", surnames$name))[1]]
georgeShare * kieronShare * pop2013 # "Kieron George" odds
surnames$count[which(grepl("LOTT", surnames$name))[1]] # Number of Americans with last name "Lott", should be 25118
#### Files to export
write.csv(topFirstName, file="new-top-firstNames.csv")
write.csv(topSurnames, file="new-top-surnames.csv")
write.csv(nameMatches2, file="independent-name-combinations-by-pop.csv")
write.csv(advMatrixFinal, file="adjusted-name-combinations-matrix.csv")
write.csv(adjustedTotals, file="adjusted-name-combinations-list.csv")