forked from fivethirtyeight/data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmost-common-name.R
217 lines (157 loc) · 10.5 KB
/
most-common-name.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#### Story: "Dear Mona, What’s The Most Common Name In America?"
#### Url: http://fivethirtyeight.com/features/whats-the-most-common-name-in-america/
#### Authors: Mona Chalabi (Mona.Chalabi@fivethirtyeight.com) and Andrew Flowers (andrew.flowers@fivethirtyeight.com)
# library() stops with an error when a package is missing; require() merely
# returns FALSE and would let the script run on into confusing downstream errors.
library(babynames)
library(dplyr)
library(reshape2)
library(zoo)
library(datasets)
# Census population parameters
pop2000 <- 276059000 # year 2000 population
pop2013 <- 316128839 # year 2013 population
hispPopShare <- .171 # Hispanic share of overall population
foreignPopShare <- .127 # Foreign-born share of overall population
# Census growth rates by racial categories, 2000-2013 (2013 pop / 2000 pop)
whiteGrowth <- 1.01155164
blackGrowth <- 1.138799977
aianGrowth <- 1.110695106 # American Indian / Alaska Native
asianGrowth <- 1.553975166
twoRaceGrowth <- 1.817182595
hispGrowth <- 1.531490233
### Step 1: Actuarial Adjustments
# Build a "percent still alive in 2013" curve for every birth year 1900-2013
# from SSA decade-level life tables, then apply it to the national babynames
# counts to estimate how many living people hold each first name.
aging <- read.csv("aging-curve.csv") # taken from SSA: http://www.ssa.gov/oact/NOTES/as120/LifeTables_Tbl_7.html; 2013 is used as base year
agingSmooth <- data.frame(year=c(1900:2013))
# The SSA table only has rows at decade marks; match() leaves NA in between
agingSmooth$perctAliveMale <- aging[match(agingSmooth$year, aging$Decade),]$Male.1
agingSmooth$perctAliveFemale <- aging[match(agingSmooth$year, aging$Decade),]$Female.1
# Linearly interpolate the decade gaps; rows 1:111 are birth years 1900-2010
# (na.approx drops the trailing NAs, hence the explicit index range)
agingSmooth$perctAliveMale[1:111] <- na.approx(agingSmooth$perctAliveMale)
agingSmooth$perctAliveFemale[1:111] <- na.approx(agingSmooth$perctAliveFemale)
# Rows 112:114 are birth years 2011-2013, past the last SSA decade mark
agingSmooth$perctAliveMale[112:114] <- 0.993480 # Assume probabilities are flat since 2010
agingSmooth$perctAliveFemale[112:114] <- 0.994490 # Assume probabilities are flat since 2010
# Estimate stock of first names of those currently living, using actuarial probabilities
babyNames <- babynames %>% filter(year>=1900)
# Sex-specific survival probability looked up by birth year
babyNames$perctAlive <- ifelse(babyNames$sex=="F", agingSmooth[match(babyNames$year, agingSmooth$year),]$perctAliveFemale, agingSmooth[match(babyNames$year, agingSmooth$year),]$perctAliveMale)
babyNames$nAlive <- babyNames$n * babyNames$perctAlive
# Collapse across years and sexes to one estimated living total per name
stockNames <- babyNames %>% group_by(name) %>%
dplyr::summarize(totalAlive=sum(nAlive)) %>% ungroup() %>%
arrange(desc(totalAlive))
# NOTE(review): grep() matches substrings, so this also hits e.g. "Mariah";
# [1] takes the highest-ranked such match -- confirm it is exactly "Maria"
grep("Maria", stockNames$name)[1] # "Maria" is the 97th most common first name (unadjusted)
stockNames$perct2013 <- stockNames$totalAlive/pop2013
#### Step 2: Adjust first names to correct for undercounting of Hispanic names
# Why? Need to scale up Hispanic/Latino names because immigration not factored into SSA data set
# We can make inferences about Hispanic name undercounting by looking at names by state, which the SSA does provide.
######### ONLY RUN THIS ONCE, DATA SAVED IN all-states-babynames.csv #########
# Download state files from here: http://www.ssa.gov/oact/babynames/state/namesbystate.zip
# stateFiles <- grep("\\.TXT", list.files())
# allStates <- data.frame()
# for (i in stateFiles){
#   stateData <- read.table(list.files()[i], sep=",")
#   allStates <- rbind(allStates, stateData)
# }
# names(allStates) <- c("state", "sex", "year", "name", "n")
# write.csv(allStates, "all-states-babynames.csv")
######### ONLY RUN THIS ONCE, DATA SAVED IN all-states-babynames.csv #########
allStates <- read.csv("all-states-babynames.csv", stringsAsFactors=FALSE) # Huge file, created from code above
# Survival probability by birth year and sex, as in Step 1.
# BUG FIX: the male branch previously looked up agingSmooth rows with
# babyNames$year (a different, differently-sized table -- a copy-paste from
# the national computation above), so male survival rates were misaligned
# across rows. Both branches must be indexed by allStates$year.
allStates$perctAlive <- ifelse(allStates$sex=="F", agingSmooth[match(allStates$year, agingSmooth$year),]$perctAliveFemale, agingSmooth[match(allStates$year, agingSmooth$year),]$perctAliveMale)
allStates$nAlive <- allStates$n * allStates$perctAlive
# Estimated living holders of each name within each state
stateNames <- allStates %>% group_by(name, state) %>%
dplyr::summarize(totalAlive=sum(nAlive)) %>% ungroup() %>%
arrange(desc(totalAlive))
# Import state's hispanic population
statePop <- read.csv("state-pop.csv", stringsAsFactors=F)
# Population columns arrive as strings with thousands separators ("1,234,567")
statePop$totalPop <- gsub( ",", "", statePop$totalPop)
statePop$hispPop <- gsub( ",", "", statePop$hispPop)
statePop$hispPerct <- as.numeric(statePop$hispPop)/as.numeric(statePop$totalPop)
# Convert full state names to postal abbreviations to match the SSA state files
statePop$state <- state.abb[match(statePop$state, state.name)]
# D.C. is not in datasets::state.name, so the match above left an NA there.
# NOTE(review): hard-coded row index -- breaks if state-pop.csv is reordered;
# presumably row 9 is D.C. in the shipped file -- verify against the CSV
statePop$state[9] <- "DC" # Correct D.C. being coded as "NA"
# Add Hispanic population percentage and state population to each name/state combo
stateNames$hispPerct <- statePop[match(stateNames$state, statePop$state),]$hispPerct
stateNames$statePop <- statePop[match(stateNames$state, statePop$state),]$totalPop
# Calculate frequency of that name in that state, nation-wide, and difference between the two
stateNames$stateFreq <- as.numeric(stateNames$totalAlive) / as.numeric(stateNames$statePop)
stateNames$natFreq <- stockNames[match(stateNames$name, stockNames$name),]$perct2013
stateNames$diffFreq <- stateNames$stateFreq - stateNames$natFreq
# Group name/state combos into one national total, and take top 1000 names
natNamesByState <- stateNames %>% group_by(name) %>% summarize(n=sum(totalAlive)) %>% arrange(desc(n))
topNames <- natNamesByState[1:1000,]$name
# Regress the frequency of each top-1000 first name in a state on that
# state's Hispanic population share. A positive hispPerct slope means the
# name is relatively more common in heavily Hispanic states.
# Formula: stateFrequency ~ intercept + hispPerct, weighted by the square-root of the state's population
coefs <- data.frame(intercept=rep(NA_real_, length(topNames)), hispPerct=rep(NA_real_, length(topNames)))
for(i in seq_along(topNames)){ # seq_along(), not 1:length(): safe if topNames were empty
  # Exact equality instead of the original paste0("^", name, "$") regex:
  # equivalent for plain names, and immune to regex metacharacters
  m <- lm(data=stateNames[stateNames$name == topNames[i],], formula=stateFreq~hispPerct, weights=sqrt(as.numeric(statePop)))
  coefs[i,] <- coef(m) # preallocated; the old rbind-in-a-loop grew the frame O(n^2)
}
names(coefs) <- c("intercept", "hispPerct")
coefs$name <- topNames
# correction = ratio of the name's predicted national frequency at the true
# Hispanic population share vs. at the foreign-born share the SSA data reflects
coefs$correction <- (coefs$intercept + coefs$hispPerct*hispPopShare)/(coefs$intercept + coefs$hispPerct*foreignPopShare)
hispNameCorrex <- coefs %>% arrange(desc(correction))
# Cap Hispanic name correction on top and bottom
hispNameCorrex$correction <- ifelse(hispNameCorrex$correction > (hispPopShare/foreignPopShare), (hispPopShare/foreignPopShare), hispNameCorrex$correction)
hispNameCorrex$correction <- ifelse(hispNameCorrex$correction < (1-hispPopShare)/(1-foreignPopShare), (1-hispPopShare)/(1-foreignPopShare), hispNameCorrex$correction)
# Names outside the top 1000 get no correction (factor of 1)
stockNames$correction <- hispNameCorrex[match(stockNames$name, hispNameCorrex$name),]$correction
stockNames$correction <- ifelse(is.na(stockNames$correction), 1.0, stockNames$correction)
# BUG FIX: was stockNames$total, which only worked via data.frame $ partial
# matching; dplyr returns tibbles, where $total is NULL (no partial match),
# silently producing an empty/NA column. Use the full column name.
stockNames$newTotal <- stockNames$totalAlive * stockNames$correction
stockNames$newPerct2013 <- stockNames$newTotal / pop2013
stockNames2 <- stockNames %>% arrange(desc(newPerct2013))
#### Step 3: Surnames analysis
# Project each surname's 2000 Census count forward to 2013 by scaling every
# racial component with its own growth rate (parameters at top of script).
surnames <- read.csv("surnames.csv", stringsAsFactors=FALSE)
baseCount <- as.numeric(surnames$count) # 2000 Census count, hoisted once
surnames$hispCount2013 <- baseCount * (as.numeric(surnames$pcthispanic)/100) * hispGrowth
surnames$whiteCount2013 <- baseCount * (as.numeric(surnames$pctwhite)/100) * whiteGrowth
surnames$blackCount2013 <- baseCount * (as.numeric(surnames$pctblack)/100) * blackGrowth
surnames$asianCount2013 <- baseCount * (as.numeric(surnames$pctapi)/100) * asianGrowth
surnames$aianCount2013 <- baseCount * (as.numeric(surnames$pctaian)/100) * aianGrowth
surnames$twoRaceCount2013 <- baseCount * (as.numeric(surnames$pct2prace)/100) * twoRaceGrowth
# Recombine the projected racial components into a 2013 total and share
surnames$Count2013 <- with(surnames, hispCount2013 + whiteCount2013 + blackCount2013 + asianCount2013 + aianCount2013 + twoRaceCount2013)
surnames$perct2013 <- surnames$Count2013/pop2013
surnames <- surnames %>% arrange(desc(perct2013))
# Keep the 100 most common first names and surnames
topFirstName <- stockNames2[1:100,] %>% select(name, newPerct2013)
topSurnames <- surnames[1:100,] %>% select(name, perct2013)
# 100x100 grid: rows = top first names, columns = top surnames
nameMatches <- data.frame(matrix(NA, nrow = 100, ncol = 100))
row.names(nameMatches) <- topFirstName$name
names(nameMatches) <- topSurnames$name
# Under independence, P(first & last) = P(first) * P(last); fill column-wise
for (j in seq_len(100)){
  nameMatches[,j] <- topFirstName$newPerct2013 * topSurnames[j,]$perct2013
}
nameMatches2 <- nameMatches * pop2013 # Scale matrix by 2013 population
# Melt independent name matches into long format: one row per
# (first name, surname) pair with its estimated independent count
longNames <- melt(nameMatches2, measure.vars=1:100)
# melt() stacks the 100 surname columns in order and keeps row order within
# each column, so recycling the first-name vector 100 times relabels every
# row correctly (relies on that ordering contract of reshape2::melt)
longNames$firstName <- rep(topFirstName$name, 100)
longNames <- longNames %>% select(firstName, variable, value) %>% arrange(desc(value))
names(longNames) <- c("FirstName", "Surname", "EstimatedNumber")
# Read in Hartman's adjustments table (percent deviation from independence
# for first-name/surname pairs), and melt it
# Source for the data in the 'adjustments.csv' is here: http://mypage.siu.edu/lhartman/johnsmith.html
adjustments <- read.csv("adjustments.csv")
adjustments2 <- melt(adjustments, measure.vars=2:21)
names(adjustments2) <- c("FirstName", "Surname", "Adjustment")
# Clean full names, attach estimated number (by pure independent odds);
# lower-cased "first last" strings serve as the join key between the tables
longNames$cleanName <- paste(tolower(longNames$FirstName), tolower(longNames$Surname), sep=" ")
adjustments2$cleanName <- paste(tolower(adjustments2$FirstName), tolower(adjustments2$Surname), sep=" ")
adjustments2$Estimate <- longNames[match(adjustments2$cleanName, longNames$cleanName),]$EstimatedNumber
adjustedTotals <- adjustments2 %>% arrange(desc(Estimate))
# Apply Hartman's percentage adjustment to the independence-based estimate
adjustedTotals$finalEstimate <- adjustedTotals$Estimate * (1+ (adjustedTotals$Adjustment/100))
adjustedTotals <- adjustedTotals %>% arrange(desc(finalEstimate))
# NOTE(review): adjustedMatrix is never referenced again below -- presumably
# kept for interactive inspection; confirm before removing
adjustedMatrix <- dcast(adjustedTotals, formula=FirstName~Surname)
# Pairs outside Hartman's table get NA finalEstimate from this match()
longNames$finalEstimate <- adjustedTotals[match(longNames$cleanName, adjustedTotals$cleanName),]$finalEstimate
finalMatrix <- dcast(longNames, formula=FirstName~Surname)
# Keep the top-20 first names (row order of the independence matrix) and the
# first 20 surname columns (plus the FirstName column) for the final table
tableNames <- row.names(nameMatches)[1:20]
finalMatrix2 <- finalMatrix[match(tableNames, finalMatrix$FirstName),]
advMatrixFinal <- finalMatrix2[,1:21]
#### Numbers to check in piece
# Sanity-check figures quoted in the article text
sum(stockNames2$newPerct2013) # first names cover 80% of population
sum(stockNames2$newTotal) # first names cover 255 million people
# Positions of two unadjusted full names in the ranked long table
which(grepl("maria garcia", adjustments2$cleanName)) # Maria Garcia (unadjusted) is the 354th most common name
which(grepl("maria smith", adjustments2$cleanName)) # Maria Smith (unadjusted) is the 74th most common name
# Independence-based headcount for a rare combination
kieronShare <- stockNames2$newPerct2013[which(grepl("Kieron", stockNames2$name))[1]]
georgeShare <- surnames$perct2013[which(grepl("GEORGE", surnames$name))[1]]
georgeShare * kieronShare * pop2013 # "Kieron George" odds
surnames$count[which(grepl("LOTT", surnames$name))[1]] # Number of Americans with last name "Lott", should be 25118
#### Files to export
write.csv(topFirstName, file="new-top-firstNames.csv")
write.csv(topSurnames, file="new-top-surnames.csv")
write.csv(nameMatches2, file="independent-name-combinations-by-pop.csv")
write.csv(advMatrixFinal, file="adjusted-name-combinations-matrix.csv")
write.csv(adjustedTotals, file="adjusted-name-combinations-list.csv")