Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

don't pad 4 digit tracts and change column names to start with census… #25

Merged
merged 2 commits into from
Jul 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 25 additions & 25 deletions 00_make_block_group_shp.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ library(sf)
blk_grps_sf_2010 <- st_read("nhgis0002_shape/nhgis0002_shapefile_tl2010_us_blck_grp_2010/US_blck_grp_2010.shp")

blk_grps_sf_2010 <- st_transform(blk_grps_sf_2010, crs=5072) %>%
dplyr::select(fips_block_group_id_2010 = GEOID10,
dplyr::select(census_block_group_id_2010 = GEOID10,
geometry)

blk_grps_sf_2010 <- sf::st_make_valid(blk_grps_sf_2010)
Expand All @@ -17,9 +17,9 @@ saveRDS(blk_grps_sf_2010, "block_groups_2010_5072.rds")
blk_grps_sf_2000 <- st_read("nhgis0015_shape/nhgis0015_shapefile_tl2000_us_blck_grp_2000/US_blck_grp_2000.shp")

blk_grps_sf_2000 <- st_transform(blk_grps_sf_2000, crs=5072) %>%
dplyr::select(fips_block_group_id_2000 = STFID,
dplyr::select(census_block_group_id_2000 = STFID,
geometry) %>%
mutate(fips_block_group_id_2000 = as.character(fips_block_group_id_2000))
mutate(census_block_group_id_2000 = as.character(census_block_group_id_2000))

blk_grps_sf_2000 <- sf::st_make_valid(blk_grps_sf_2000)

Expand All @@ -31,17 +31,13 @@ blk_grps_sf_1990 <- st_read("nhgis0016_shape/nhgis0016_shapefile_tl2000_us_blck_

blk_grps_sf_1990 <- st_transform(blk_grps_sf_1990, crs=5072)

blk_grps_sf_1990 %>%
dplyr::mutate(state_fips = stringr::str_sub(string = GISJOIN2, 1, 2))

blk_grps_sf_1990 <- blk_grps_sf_1990 %>%
dplyr::mutate(state_fips = stringr::str_sub(blk_grps_sf_1990$GISJOIN2, 1, 2),
county_fips = stringr::str_sub(blk_grps_sf_1990$GISJOIN2, 4, 6),
tract_fips = stringr::str_sub(blk_grps_sf_1990$GISJOIN2, 8, -2),
tract_fips = stringr::str_pad(tract_fips, 6, pad = "0"),
fips_block_group_id_1990 = glue::glue('{state_fips}{county_fips}{tract_fips}{GROUP}')) %>%
select(fips_block_group_id_1990, geometry) %>%
mutate(fips_block_group_id_1990 = as.character(fips_block_group_id_1990))
census_block_group_id_1990 = glue::glue('{state_fips}{county_fips}{tract_fips}{GROUP}')) %>%
select(census_block_group_id_1990, geometry) %>%
mutate(census_block_group_id_1990 = as.character(census_block_group_id_1990))

blk_grps_sf_1990 <- sf::st_make_valid(blk_grps_sf_1990)

Expand All @@ -54,34 +50,38 @@ tracts_sf_1980 <- st_read('nhgis0018_shape/nhgis0018_shapefile_tl2000_us_tract_1
tracts_sf_1980 <- st_transform(tracts_sf_1980, crs=5072)

tracts_sf_1980 <- tracts_sf_1980 %>%
dplyr::mutate(state_fips = stringr::str_sub(NHGISST, 1, 2),
county_fips = stringr::str_sub(NHGISCTY, 1, 3),
tract_fips = stringr::str_sub(GISJOIN, 9, -1),
tract_fips = stringr::str_pad(tract_fips, 6, pad = "0"),
fips_tract_id_1980 = glue::glue('{state_fips}{county_fips}{tract_fips}')) %>%
dplyr::select(fips_tract_id_1980,
mutate(state_fips = stringr::str_sub(NHGISST, 1, 2),
county_fips = stringr::str_sub(NHGISCTY, 1, 3),
tract_fips = stringr::str_sub(GISJOIN2, 8, -1),
census_tract_id_1980 = glue::glue('{state_fips}{county_fips}{tract_fips}')) %>%
dplyr::select(census_tract_id_1980,
geometry) %>%
mutate(fips_tract_id_1980 = as.character(fips_tract_id_1980))
mutate(census_tract_id_1980 = as.character(census_tract_id_1980))

tracts_sf_1980 <- sf::st_make_valid(tracts_sf_1980)

saveRDS(tracts_sf_1980, "tracts_1980_5072.rds")
# s3 location s3://geomarker/geometries/tracts_1980_5072.rds

# 1970
tracts_sf_1970 <- st_read('nhgis0018_shape/nhgis0018_shapefile_tl2000_us_tract_1970/US_tract_1970.shp')
tracts_sf_1970 <- st_read('~/Downloads/nhgis0018_shape/nhgis0018_shapefile_tl2000_us_tract_1970/US_tract_1970.shp')

tracts_sf_1970 <- st_transform(tracts_sf_1970, crs=5072)

## remove incomplete tract identifiers (e.g. "3600050nodata")
remove <- tracts_sf_1970[duplicated(tracts_sf_1970$GISJOIN2),]$GISJOIN2

tracts_sf_1970 <- tracts_sf_1970 %>%
filter(!GISJOIN2 %in% remove)

tracts_sf_1970 <- tracts_sf_1970 %>%
dplyr::mutate(state_fips = stringr::str_sub(NHGISST, 1, 2),
county_fips = stringr::str_sub(NHGISCTY, 1, 3),
tract_fips = stringr::str_sub(GISJOIN, 9, -1),
tract_fips = stringr::str_pad(tract_fips, 6, pad = "0"),
fips_tract_id_1970 = glue::glue('{state_fips}{county_fips}{tract_fips}')) %>%
dplyr::select(fips_tract_id_1970,
mutate(state_fips = stringr::str_sub(NHGISST, 1, 2),
county_fips = stringr::str_sub(NHGISCTY, 1, 3),
tract_fips = stringr::str_sub(GISJOIN2, 8, -1),
census_tract_id_1970 = glue::glue('{state_fips}{county_fips}{tract_fips}')) %>%
dplyr::select(census_tract_id_1970,
geometry) %>%
mutate(fips_tract_id_1970 = as.character(fips_tract_id_1970))
mutate(census_tract_id_1970 = as.character(census_tract_id_1970))

tracts_sf_1970 <- sf::st_make_valid(tracts_sf_1970)

Expand Down
8 changes: 3 additions & 5 deletions 01_get_2020_block_groups.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ states <-

block_group_shp <- pmap(list(states$fl_name, states$state_folder, states$STATEFP),
possibly(get_block_group_shp, NA_real_))
#block_group_shp[[6]] <- get_block_group_shp(states$fl_name[6], states$state_folder[6], states$STATEFP[6])
names(block_group_shp) <- states$NAME

block_group_shp_all <- block_group_shp[[1]]
Expand All @@ -35,13 +36,10 @@ for (i in 2:length(block_group_shp)) {
}

blk_grps_sf_2020 <- block_group_shp_all %>%
dplyr::select(fips_block_group_id_2020 = GEOID20,
dplyr::select(census_block_group_id_2020 = GEOID20,
geometry) %>%
mutate(fips_block_group_id_2020 = as.character(fips_block_group_id_2020))
mutate(census_block_group_id_2020 = as.character(census_block_group_id_2020))

saveRDS(blk_grps_sf_2020, "block_groups_2020_5072.rds")





2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ FROM rocker/r-ver:4.0.4

# DeGAUSS container metadata
ENV degauss_name="census_block_group"
ENV degauss_version="0.5.1"
ENV degauss_version="0.6.0"
ENV degauss_description="census block group and tract"
ENV degauss_argument="census year [default: 2010]"

Expand Down
30 changes: 18 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
If `my_address_file_geocoded.csv` is a file in the current working directory with coordinate columns named `lat` and `lon`, then the [DeGAUSS command](https://degauss.org/using_degauss.html#DeGAUSS_Commands):

```sh
docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/census_block_group:0.5.1 my_address_file_geocoded.csv
docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/census_block_group:0.6.0 my_address_file_geocoded.csv
```

will produce `my_address_file_geocoded_census_block_group_0.5.1_2010.csv` with added columns:
will produce `my_address_file_geocoded_census_block_group_0.6.0_2010.csv` with added columns:

- **`fips_block_group_id_2010`**: identifier for 2010 block group
- **`fips_tract_id_2010`**: identifier for 2010 tract
- **`census_block_group_id_2010`**: identifier for 2010 block group
- **`census_tract_id_2010`**: identifier for 2010 tract

### Optional Argument

Expand All @@ -24,12 +24,24 @@ The default census year is 2010, but can be changed by supplying an optional arg
docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/census_block_group:0.6.0 my_address_file_geocoded.csv 1990
```

will produce `my_address_file_geocoded_census_block_group_0.5.1_1990.csv`, with columns called **`fips_block_group_id_1990`** and **`fips_tract_id_1990`**.
will produce `my_address_file_geocoded_census_block_group_0.6.0_1990.csv`, with columns called **`census_block_group_id_1990`** and **`census_tract_id_1990`**.

Available years for census block group and census tract identifiers include 1990, 2000, 2010, and 2020. Additionally, tract identifiers are available for 1970 and 1980.

## st_census_tract

For spatiotemporal data in which each location is associated with a specified date range, consider using the [`st_census_tract`](https://degauss.org/st_census_tract/) container, which adds census tract identifiers for the appropriate vintage (1970-2020) based on `start_date` and `end_date` for each input location.

## Geomarker Methods

- Block group shape files were downloaded from [nhgis.org](https://www.nhgis.org/) and reprojected to EPSG 5072.

- All shape files were made valid using `sf::st_make_valid`.

- 2020 block groups were not yet available via NHGIS, and were downloaded directly from the [U.S. Census](https://www2.census.gov).

## Geomarker Data

- Census block groups are a low level designation within the US Census geographical hierarchy, one degree finer than a census tract. The US Census provides a diagram visualizing the [hierarchy](https://www2.census.gov/geo/pdfs/reference/geodiagram.pdf).
- The first 11 characters in a census block group GEOID indicate the census tract, county, and state that the block group lies within. The US Census GEOIDs are constructed in a manner that reflects the geographical hierarchy of the designated area. By using the segments of the GEOID, it is possible to select data based on area types further up in the hierarchy.

Expand All @@ -40,13 +52,7 @@ Available years for census block group and census tract identifiers include 1990
| Census Tract | State + County + Tract | 2+3+6=11 | Tract 32 in Hamilton County | 39061003200 |
| Block Group | State + County + Tract +<br /> Block Group | 2+3+6+1=12 | Block Group 1 in Tract 32 | 390610032001 |

Due to inconsistencies in the 1970 and 1980 tract identifiers, we concatenated the state FIPS (`NHGISST`), county FIPS (`NGHISCTY`), and tract FIPS (the last 4 or 6 digits of `GISJOIN2`) to construct the full `fips_tract_id`. Since the length of tract FIPS codes varied, we padded all tract FIPS to the maximum 6 digits using zeros.

## st_census_tract

For spatiotemporal data in which each location is associated with a specified date range, consider using the [`st_census_tract`](https://degauss.org/st_census_tract/) container, which adds census tract identifiers for the appropriate vintage (1970-2020) based on `start_date` and `end_date` for each input location.

## Geomarker Data
*Block Group identifiers are defined as the concatenation of the state, county, tract, and block group fips identifiers (commonly called GISJOIN or GEOID in census data). All census tract identifiers are 11 digits and all census block group identifiers are 12 digits, with the exception of some 1990, 1980, and 1970 tracts that are 9 digits, resulting in 10 digit block group identifiers.*

- block group shapefiles for 1990, 2000, and 2010, as well as tract shapefiles for 1970 and 1980, were obtained from [NHGIS](https://www.nhgis.org/) and transformed using the `00_make_block_group_shp.R` file in this repository.

Expand Down
4 changes: 2 additions & 2 deletions entrypoint.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ d$d <- suppressWarnings( sf::st_join(d$d, geography, left = FALSE, largest = TRU

if(! opt$census_year %in% c('1980', '1970')) {
d$d <- d$d %>%
mutate_at(vars(starts_with(glue::glue('fips_block_group_id_{opt$census_year}'))),
list(fips_tract_id = ~stringr::str_sub(.x, 1, 11)))
mutate_at(vars(starts_with(glue::glue('census_block_group_id_{opt$census_year}'))),
list(census_tract_id = ~stringr::str_sub(.x, 1, 11)))

names(d$d)[ncol(d$d)] <- glue::glue('{names(d$d)[ncol(d$d)]}_{opt$census_year}')
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,lat,lon,start_date,end_date,census_tract_id_1970
55001310120,NA,NA,6/11/20,6/18/20,NA
55000100280,39.19674,-84.582601,3/1/17,3/8/17,39061020801
55000100281,39.28765,-84.510173,1/30/12,2/6/12,39061021502
55000100282,39.158521,-84.417572,12/1/20,12/8/20,390610054
55000100283,39.158521,-84.417572,4/8/19,4/15/19,390610054
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,lat,lon,start_date,end_date,census_tract_id_1980
55001310120,NA,NA,6/11/20,6/18/20,NA
55000100280,39.19674,-84.582601,3/1/17,3/8/17,39061020801
55000100281,39.28765,-84.510173,1/30/12,2/6/12,39061021505
55000100282,39.158521,-84.417572,12/1/20,12/8/20,390610054
55000100283,39.158521,-84.417572,4/8/19,4/15/19,390610054
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,lat,lon,start_date,end_date,census_block_group_id_1990,census_tract_id_1990
55001310120,NA,NA,6/11/20,6/18/20,NA,NA
55000100280,39.19674,-84.582601,3/1/17,3/8/17,390610208011,39061020801
55000100281,39.28765,-84.510173,1/30/12,2/6/12,390610215051,39061021505
55000100282,39.158521,-84.417572,12/1/20,12/8/20,3906100543,3906100543
55000100283,39.158521,-84.417572,4/8/19,4/15/19,3906100543,3906100543
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,lat,lon,start_date,end_date,census_block_group_id_2000,census_tract_id_2000
55001310120,NA,NA,6/11/20,6/18/20,NA,NA
55000100280,39.19674,-84.582601,3/1/17,3/8/17,390610208111,39061020811
55000100281,39.28765,-84.510173,1/30/12,2/6/12,390610215051,39061021505
55000100282,39.158521,-84.417572,12/1/20,12/8/20,390610054001,39061005400
55000100283,39.158521,-84.417572,4/8/19,4/15/19,390610054001,39061005400
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,lat,lon,start_date,end_date,census_block_group_id_2010,census_tract_id_2010
55001310120,NA,NA,6/11/20,6/18/20,NA,NA
55000100280,39.19674,-84.582601,3/1/17,3/8/17,390610208111,39061020811
55000100281,39.28765,-84.510173,1/30/12,2/6/12,390610215051,39061021505
55000100282,39.158521,-84.417572,12/1/20,12/8/20,390610054001,39061005400
55000100283,39.158521,-84.417572,4/8/19,4/15/19,390610054001,39061005400
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,lat,lon,start_date,end_date,census_block_group_id_2020,census_tract_id_2020
55001310120,NA,NA,6/11/20,6/18/20,NA,NA
55000100280,39.19674,-84.582601,3/1/17,3/8/17,390610208111,39061020811
55000100281,39.28765,-84.510173,1/30/12,2/6/12,390610215051,39061021505
55000100282,39.158521,-84.417572,12/1/20,12/8/20,390610276001,39061027600
55000100283,39.158521,-84.417572,4/8/19,4/15/19,390610276001,39061027600