1.1 Download and prepare data

1.1.1 Loading library

1.1.2 Loading variables and data wrangling

# loading the variables 2019 was chosen since it is the most recent
variables <- load_variables(2019, "acs5", cache = TRUE)

# Housing Variables-Parker K

## GIS version for Maps
MedianRentGIS <- get_acs(
  geography = "county",
  variables = "B25064_001",
  geometry = TRUE,
  keep_geo_vars = TRUE
) %>%
  select(GEOID,
    stFIPS = STATEFP,
    coFIPS = COUNTYFP,
    coNAME = NAME.x,
    median_rent = estimate,
    geometry
  )

## Non GIS version for regressions
MedianRent <- get_acs(
  geography = "county",
  variables = "B25064_001",
  geometry = FALSE
) %>%
  select(GEOID,
    NAME,
    median_rent = estimate
  )

TotalRent <- get_acs(
  geography = "county",
  variables = c("B01003_001", "B25003_003"),
  geometry = FALSE
) %>%
  select(-moe) %>%
  spread(variable, estimate) %>%
  mutate(
    B01003_001 = as.numeric(B01003_001),
    B25003_003 = as.numeric(B25003_003),
    perc_rent = (B25003_003 / B01003_001) * 100
  ) %>%
  select(-c("B01003_001", "B25003_003", "NAME"))

# Demographic variables-Hung Nguyen
MedianAge <- get_acs(
  geography = "county",
  variables = "B01002_001",
  geometry = FALSE
) %>%
  select(GEOID,
    median_age = estimate
  )

Race <- get_acs(
  geography = "county",
  variables = c("B02001_001", "B02001_002"),
  geometry = FALSE
) %>%
  select(-moe) %>%
  spread(variable, estimate) %>%
  mutate(
    B02001_001 = as.numeric(B02001_001),
    B02001_002 = as.numeric(B02001_002),
    perc_white = (B02001_002 / B02001_001) * 100
  ) %>%
  select(-c("B02001_001", "B02001_002", "NAME"))


# Income variables- Raian Rith

IncBelowPovertyLevel <- get_acs(
  geography = "county",
  variables = c("B01003_001", "B17021_002"),
  geometry = FALSE
) %>%
  select(-moe) %>%
  spread(variable, estimate) %>%
  mutate(
    B01003_001 = as.numeric(B01003_001),
    B17021_002 = as.numeric(B17021_002),
    perc_below_pov = (B17021_002 / B01003_001) * 100
  ) %>%
  select(-c("B01003_001", "B17021_002", "NAME"))

MedianIncome <- get_acs(
  geography = "county",
  variables = "B06011_001",
  geometry = FALSE
) %>%
  select(GEOID,
    median_income = estimate
  )

# Education variables- Gui
HighSchool <- get_acs(
  geography = "county", # Change to percentage
  variables = c("B06009_001", "B06009_002"),
  geometry = FALSE
) %>%
  select(-moe) %>%
  spread(variable, estimate) %>%
  mutate(
    B06009_001 = as.numeric(B06009_001),
    B06009_002 = as.numeric(B06009_002),
    perc_hs = (B06009_002 / B06009_001) * 100
  ) %>%
  select(-c("B06009_001", "B06009_002", "NAME"))

Doctorate <- get_acs(
  geography = "county", # Change to percentage
  variables = c("B06009_001", "B15003_025"),
  geometry = FALSE
) %>%
  select(-moe) %>%
  spread(variable, estimate) %>%
  mutate(
    B06009_001 = as.numeric(B06009_001),
    B15003_025 = as.numeric(B15003_025),
    perc_doc = (B15003_025 / B06009_001) * 100
  ) %>%
  select(-c("B06009_001", "B15003_025", "NAME"))

# State data (for displaying state borders on map)
stateGIS <- get_acs(
  geography = "state",
  variables = "B01001_001",
  geometry = TRUE,
  keep_geo_vars = TRUE
)

1.1.3 Merging data

# Merging 8 variables
## Version 1: with geometry information
metadataGIS <- MedianRentGIS %>%
  inner_join(TotalRent, by = "GEOID") %>%
  inner_join(MedianAge, by = "GEOID") %>%
  inner_join(Race, by = "GEOID") %>%
  inner_join(IncBelowPovertyLevel, by = "GEOID") %>%
  inner_join(MedianIncome, by = "GEOID") %>%
  inner_join(HighSchool, by = "GEOID") %>%
  inner_join(Doctorate, by = "GEOID") %>%
  rename(FIPS = GEOID)

## Version 2: without geometry information
metadata <- MedianRent %>%
  inner_join(TotalRent, by = "GEOID") %>%
  inner_join(MedianAge, by = "GEOID") %>%
  inner_join(Race, by = "GEOID") %>%
  inner_join(IncBelowPovertyLevel, by = "GEOID") %>%
  inner_join(MedianIncome, by = "GEOID") %>%
  inner_join(HighSchool, by = "GEOID") %>%
  inner_join(Doctorate, by = "GEOID") %>%
  rename(FIPS = GEOID)

1.1.4 Processing data

# Simplify GIS data to make file sizes smaller. This essentially removes some details along coastlines and very-not-straight borders.

stateGIS <- ms_simplify(stateGIS, keep = 0.01)
metadataGIS <- ms_simplify(metadataGIS, keep = 0.01)

## join 2-character state abbreviation and create name = county, ST for labeling maps
fipsToSTcode <- fips_codes %>%
  unite("FIPS", state_code, county_code, sep = "", remove = FALSE) %>%
  select(FIPS, stFIPS = state_code, stNAME = state)

metadataGIS <- inner_join(metadataGIS, fipsToSTcode, by = c("FIPS", "stFIPS"))
metadataGIS <- metadataGIS %>% mutate(name = paste0(coNAME, ", ", stNAME))

## join stFIPS for metadata
metadata <- inner_join(metadata, fipsToSTcode, by = c("FIPS"))

## For maps, drop the following:
##   Puerto Rico (ST FIPS 72) (no election data)
##   Alaska (ST FIPS 02) (voting data isn't reported by county...we could also map the legislative districts, but we're not going to since we'd rather have smaller maps without those extra details)
##   Hawaii (ST FIPS 15) (so our map can zoom in on continental 48 states)
stateGIS <- stateGIS %>% filter(GEOID != "72" & GEOID != "02" & GEOID != "15")
metadataGIS <- metadataGIS %>% filter(stFIPS != "72" & stFIPS != "02" & stFIPS != "15")
metadata <- metadata %>% filter(stFIPS != "72" & stFIPS != "02")

1.1.5 Election data download and preparation

## 2020 Election data
dta2020 <- read_csv("https://raw.githubusercontent.com/tonmcg/US_County_Level_Election_Results_08-20/master/2020_US_County_Level_Presidential_Results.csv")

## Calculate percentages based on total votes for Trump and Biden (GOP and Dem) only
##   In some years there have been ties, so we're allowing for that
##   stdVotes and stdVotesLog will be used to scale color opacitiy from 0 to 1 based on total votes

dta2020 <- dta2020 %>%
  mutate(
    pctGOP = votes_gop / (votes_gop + votes_dem),
    totalVotes = votes_gop + votes_dem,
    winner = ifelse(dta2020$votes_gop > dta2020$votes_dem, "Trump",
      ifelse(dta2020$votes_gop < dta2020$votes_dem, "Biden",
        "Tie"
      )
    ),
    pctWinner = ifelse(dta2020$votes_gop > dta2020$votes_dem, pctGOP, 1 - pctGOP),
    FontColorWinner = ifelse(dta2020$votes_gop > dta2020$votes_dem, "red",
      ifelse(dta2020$votes_gop < dta2020$votes_dem, "blue",
        "purple"
      )
    ),
    pctGOPcategories = case_when(
      between(pctGOP, 0, 0.3) ~ "0-30%",
      between(pctGOP, 0.3, 0.4) ~ "30-40%",
      between(pctGOP, 0.4, 0.45) ~ "40-45%",
      between(pctGOP, 0.45, 0.49) ~ "45-49%",
      between(pctGOP, 0.49, 0.51) ~ "49-51%",
      between(pctGOP, 0.51, 0.55) ~ "51-55%",
      between(pctGOP, 0.55, 0.60) ~ "55-60%",
      between(pctGOP, 0.60, 0.70) ~ "60-70%",
      between(pctGOP, 0.70, 1) ~ "70-100%"
    ),
    stdVotes = (totalVotes - min(totalVotes)) / (max(totalVotes) - min(totalVotes)),
    stdVotesLog = (log(totalVotes) - min(log(totalVotes))) / (max(log(totalVotes)) - min(log(totalVotes)))
  )



dta2020 <- dta2020 %>%
  select(FIPS = county_fips, pctGOP, totalVotes, winner, pctWinner, pctGOPcategories, FontColorWinner, stdVotes, stdVotesLog)


## merge metadataGIS with voting data

metadataGIS <- left_join(metadataGIS, dta2020, by = "FIPS")

## merge metadata with with voting data

metadata <- left_join(metadata, dta2020, by = "FIPS")

1.1.6 Metadata cleaning

metadata %>%
  filter(!complete.cases(metadata))
## # A tibble: 5 x 20
##   FIPS  NAME  median_rent perc_rent median_age perc_white perc_below_pov
##   <chr> <chr>       <dbl>     <dbl>      <dbl>      <dbl>          <dbl>
## 1 15005 Kala~        1063     59.1        57.4       24.2           9.09
## 2 32011 Eure~          NA     10.9        47.7       93.8           7.96
## 3 48033 Bord~          NA      8.97       41.7       97.6           2.61
## 4 48301 Lovi~          NA     11.2        55.2       96.9          15.3 
## 5 48443 Terr~          NA      3.24       55.2       66.1           9.38
## # ... with 13 more variables: median_income <dbl>, perc_hs <dbl>,
## #   perc_doc <dbl>, stFIPS <chr>, stNAME <chr>, pctGOP <dbl>, totalVotes <dbl>,
## #   winner <chr>, pctWinner <dbl>, pctGOPcategories <chr>,
## #   FontColorWinner <chr>, stdVotes <dbl>, stdVotesLog <dbl>
metadataGIS %>%
  filter(!complete.cases(as.tibble(metadataGIS)[, 1:22])) %>%
  as.tibble()
## # A tibble: 4 x 23
##   FIPS  stFIPS coFIPS coNAME median_rent perc_rent median_age perc_white
##   <chr> <chr>  <chr>  <chr>        <dbl>     <dbl>      <dbl>      <dbl>
## 1 48033 48     033    Borden          NA      8.97       41.7       97.6
## 2 48443 48     443    Terre~          NA      3.24       55.2       66.1
## 3 48301 48     301    Loving          NA     11.2        55.2       96.9
## 4 32011 32     011    Eureka          NA     10.9        47.7       93.8
## # ... with 15 more variables: perc_below_pov <dbl>, median_income <dbl>,
## #   perc_hs <dbl>, perc_doc <dbl>, stNAME <chr>, name <chr>, pctGOP <dbl>,
## #   totalVotes <dbl>, winner <chr>, pctWinner <dbl>, pctGOPcategories <chr>,
## #   FontColorWinner <chr>, stdVotes <dbl>, stdVotesLog <dbl>,
## #   geometry <MULTIPOLYGON [°]>
metadata <- metadata %>%
  filter(complete.cases(metadata))

metadataGIS <- metadataGIS %>%
  filter(complete.cases(as.tibble(metadataGIS)[, 1:22]))

Here, we see that there are problems that occur with median_rent not having values in the areas with small populations, such as Eureka County, Nevada, Borden County, Texas, Loving County, Texas, and Terrell County, Texas. In addition, one county from Hawaii is missing election data. This is only in the metadata. In order to treat this, and upon careful consideration, we have decided to: remove those observations.