join_code.Rmd

---
title: "joindfs"
output: html_document
date: "2025-01-04"
---

```{r setup, include=FALSE}
# Set the default chunk option `echo = TRUE` for the rest of the document,
# so chunk source code is shown alongside its output unless a chunk overrides it.
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
# Load required packages
library(dplyr)
library(stringr)


```


```{r}
#setwd("C:/Users/HP/Desktop/Msc Econometrics/ABDA/project")


# Load the three raw input tables from ./data (paths are relative to the
# current working directory) and preview the first rows of each one right
# after it has been read.

# Election results
df_elect_res <- read.csv(file = "./data/election_results_house.csv")
head(df_elect_res)

# Race metadata
df_races <- read.csv(file = "./data/races.csv")
head(df_races)

# Urbanization index
df_urb_index <- read.csv(file = "./data/urbanization-index-2022.csv")
head(df_urb_index)

```

```{r}
# Inspect how an at-large district is coded in the urbanization index,
# using Wyoming as the example.
# Author's note: Wyoming's at-large district is named 01 here, so the clean
# data was renamed to reflect this.
filter(df_urb_index, state == "WY")
```

```{r}
# Filter-then-join approach: reduce the race table to the regular (non-special)
# 2022 House races before joining it to anything else.
#
# Rows kept:
#   * cycle 2022
#   * type "HouseRace"
#   * stage "general" or "runoff"
#   * special is false
#
# `special` is compared on its lower-cased character form because read.csv()
# turns a column containing only true/false into a logical vector, and a
# logical FALSE never equals the string "false" (it is coerced to "FALSE").
# A character column holding "false" is matched exactly as before.
df_house_races_2022 <- df_races %>%
  filter(
    cycle == 2022,
    type == "HouseRace",
    stage %in% c("general", "runoff"),
    tolower(as.character(special)) == "false"
  ) %>%
  # Parse the election date. NOTE(review): "%y" expects a two-digit year
  # (e.g. 11/8/22) -- confirm against the raw file; four-digit years need "%Y".
  mutate(date = as.Date(date, format = "%m/%d/%y"))

head(df_house_races_2022)

# Author's note: df_house_races_2022 includes all states with the correct
# number of districts, plus Puerto Rico.
# df_elect_res %>% filter(state_abbrev == "NY", office_seat_name == "District 19", cycle == "2022") # test for ny
```


```{r}
# Join df_elect_res and df_races, by race id

# Copy of the filtered 2022 House races with `id` renamed to `race_id`,
# the key name used by the race-id join that is sketched (commented out)
# further down in this chunk.
df_races2 <- rename(df_house_races_2022, race_id = id)


# inner_join because we only need the house races

# joined1 <- df_elect_res %>%
#   inner_join(df_races2, by = "race_id")

# Drop identical columns
# df1 <- joined1 %>%
#   select(-one_of(names(joined1)[sapply(names(joined1), function(col) {
#     if (col %in% c("state", "district")) return(FALSE)
#     if (!grepl("\\.x$", col)) return(FALSE)
#     corresponding_col <- sub("\\.x$", ".y", col)
#     corresponding_col %in% names(joined1) && all(joined1[[col]] == joined1[[corresponding_col]], na.rm = TRUE)
#   })]))


# View the resulting dataset
# head(df1)

```


```{r}

# Combine the urbanization index with the 2022 House races, matching rows on
# the pair (state_code, district).

# Races side: build the two join keys.
#   district   - office_seat_name with a leading "District " removed, converted
#                to a number (text that is not numeric becomes NA)
#   state_code - copy of state_abbrev
df2 <- mutate(
  df_house_races_2022,
  district = as.double(str_remove(office_seat_name, "^District ")),
  state_code = state_abbrev
)

# Urbanization side: rename the existing columns to the same key names.
df3 <- rename(df_urb_index, district = cd, state_code = state)

# Inner join keeps only (state_code, district) pairs present in both tables;
# switch to left_join()/full_join() here if unmatched rows should be kept.
df_all_vars <- inner_join(df3, df2, by = c("state_code", "district"))

# # View the resulting dataset
# head(df_all_vars)
# 
# 
# # Count unique values in each column
# unique_counts <- df_all_vars %>%
#   summarise(across(everything(), ~ n_distinct(.)))
# 
# # View the counts as a dataframe
# unique_counts


```


```{r}


# 
# # df_all_vars_2022 <- df_all_vars %>%
# #   filter(cycle.y == 2022)
# 
# 
# 
# # View the resulting dataset
# head(df_all_vars_2022)
# 
# 
# # Count unique values in each column
# unique_counts <- df_all_vars_2022 %>%
#   summarise(across(everything(), ~ n_distinct(.)))
# 
# # View the counts as a dataframe
# unique_counts


```


```{r}

# Load the cleaned demographic table and preview it exactly as read from disk.
df_demographics <- read.csv(file = "./data/us_demographic_data_clean.csv")
head(df_demographics)

# Earlier approach, kept for reference: derive the district number from the
# `Name` column instead of using the ready-made number column.
# df_demographics <- df_demographics %>%
#   mutate(district = case_when(
#     str_detect(Name, "at Large") ~ "1",  # For at-large districts, assign "1"
#     TRUE ~ str_extract(Name, "(?<=District )\\d+")  # Extract number after "District"
#   ))

# Rename the district-number column to `district`, the key name used when this
# table is joined to the others, then preview the renamed table.
df_demographics <- rename(
  df_demographics,
  district = Congressional.District.Number
)
head(df_demographics)


```


```{r}

# Attach the demographic columns to the combined races/urbanization table.
# Only rows whose (State, district) pair occurs in both inputs are kept.
joined_all <- inner_join(
  df_all_vars,
  df_demographics,
  by = c("State", "district")
)

head(joined_all)

```


```{r}

# Columns carried into the final dataset, listed in output order.
# Author's note: "winner", "percent" and "ballot party" were taken out because
# they don't exist without the election results.
cols_to_keep <- c(
  "stcd",
  "State",
  "district",
  "urbanindex",
  "grouping",
  "incumbent_party",
  "Winning.party",
  "Total.Population",
  "Percentage.Women",
  "Median.Household.Income",
  "Mean.Household.Income",
  "Median.Age",
  "Percentage.Retirees..65..",
  "Percentage.Bachelors.Degree.in.Population.over.25",
  "Unemployment.Rate..16.and.over."
)

# all_of() makes the selection fail loudly if any listed column is missing.
joined_final <- select(joined_all, all_of(cols_to_keep))


```

```{r}
## Add region data

# Read the region lookup table.
regions <- read.csv(file = "./data/regions.csv")

# Left join on State: every row of joined_final is kept, and rows whose State
# has no match in the lookup get NA in the added columns.
joined_final <- left_join(joined_final, regions, by = "State")
```


```{r}
# Export the joined table (party columns still in their original coding) to a
# CSV file in the working directory, without a row-name column.
write.csv(
  x = joined_final,
  file = "final_data_with_regions.csv",
  row.names = FALSE
)

```


```{r}
# Recode the two party columns as integer indicators: DEM -> 1, REP -> 0.

# Map a vector of party labels to 1L ("DEM"), 0L ("REP") or NA_integer_
# (anything else, including NA). Values that are already "1"/"0" pass
# through unchanged, so re-running this chunk on recoded data is harmless.
# The result is always an integer vector of the same length as `party`,
# with no intermediate character coercion and no coercion warnings.
party_to_int <- function(party) {
  party <- as.character(party)
  out <- rep(NA_integer_, length(party))
  out[party %in% c("DEM", "1")] <- 1L
  out[party %in% c("REP", "0")] <- 0L
  out
}

joined_final <- joined_final %>%
  mutate(
    incumbent_party = party_to_int(incumbent_party),
    Winning.party = party_to_int(Winning.party),
    # Author's note: Louisiana has no winning party noted for any district,
    # but the incumbent is known to have won in all of them, so the
    # (already recoded) incumbent party is used as the winner there.
    Winning.party = if_else(
      State == "Louisiana",
      incumbent_party,
      Winning.party
    )
  )


```


```{r}
# Export the table with the recoded party indicator columns to a CSV file in
# the working directory, without a row-name column.
write.csv(
  x = joined_final,
  file = "final_data_with_regions_and_winners.csv",
  row.names = FALSE
)
```