8 Data Manipulation with dplyr

https://learn.datacamp.com/courses/data-manipulation-with-dplyr

library(readr)
library(tidyverse)
## ── Attaching packages ───────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ dplyr   1.0.2
## ✓ tibble  3.0.3     ✓ stringr 1.4.0
## ✓ tidyr   1.1.2     ✓ forcats 0.5.0
## ✓ purrr   0.3.4
## ── Conflicts ──────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidycensus)
library(gapminder)
head(gapminder)
## # A tibble: 6 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.

8.1 Transforming Data with dplyr

dplyr verbs: select(), filter(), arrange(), mutate()

Select() extracts particular variables from a dataset:

new_table <- gapminder %>% select(country, continent, year, pop, gdpPercap)

We can use arrange() verb to arrange a columnn in ascending or descending order.

#Ascending Order:
selected_dataset <- gapminder %>% filter(year == 1977) %>% arrange(lifeExp)
head(selected_dataset)
## # A tibble: 6 x 6
##   country       continent  year lifeExp      pop gdpPercap
##   <fct>         <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Cambodia      Asia       1977    31.2  6978607      525.
## 2 Sierra Leone  Africa     1977    36.8  3140897     1348.
## 3 Guinea-Bissau Africa     1977    37.5   745228      765.
## 4 Afghanistan   Asia       1977    38.4 14880372      786.
## 5 Angola        Africa     1977    39.5  6162675     3009.
## 6 Guinea        Africa     1977    40.8  4227026      875.
#Descending Order:
gapminder %>% filter(year == 1977) %>% arrange(desc(lifeExp))
## # A tibble: 142 x 6
##    country     continent  year lifeExp       pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>     <int>     <dbl>
##  1 Iceland     Europe     1977    76.1    221823    19655.
##  2 Sweden      Europe     1977    75.4   8251648    18856.
##  3 Switzerland Europe     1977    75.4   6316424    26982.
##  4 Japan       Asia       1977    75.4 113872473    16610.
##  5 Norway      Europe     1977    75.4   4043205    23311.
##  6 Netherlands Europe     1977    75.2  13852989    21209.
##  7 Denmark     Europe     1977    74.7   5088419    20423.
##  8 Spain       Europe     1977    74.4  36439000    13237.
##  9 Canada      Americas   1977    74.2  23796400    22091.
## 10 France      Europe     1977    73.8  53165019    18293.
## # … with 132 more rows
head(selected_dataset)
## # A tibble: 6 x 6
##   country       continent  year lifeExp      pop gdpPercap
##   <fct>         <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Cambodia      Asia       1977    31.2  6978607      525.
## 2 Sierra Leone  Africa     1977    36.8  3140897     1348.
## 3 Guinea-Bissau Africa     1977    37.5   745228      765.
## 4 Afghanistan   Asia       1977    38.4 14880372      786.
## 5 Angola        Africa     1977    39.5  6162675     3009.
## 6 Guinea        Africa     1977    40.8  4227026      875.

We can use filter() verb to extract nly particular observation of a data set based on a condition.

selected_dataset %>% filter(lifeExp < 50)
## # A tibble: 34 x 6
##    country       continent  year lifeExp      pop gdpPercap
##    <fct>         <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Cambodia      Asia       1977    31.2  6978607      525.
##  2 Sierra Leone  Africa     1977    36.8  3140897     1348.
##  3 Guinea-Bissau Africa     1977    37.5   745228      765.
##  4 Afghanistan   Asia       1977    38.4 14880372      786.
##  5 Angola        Africa     1977    39.5  6162675     3009.
##  6 Guinea        Africa     1977    40.8  4227026      875.
##  7 Niger         Africa     1977    41.3  5682086      809.
##  8 Mali          Africa     1977    41.7  6491649      686.
##  9 Gambia        Africa     1977    41.8   608274      885.
## 10 Somalia       Africa     1977    42.0  4353666     1451.
## # … with 24 more rows

Mutate() verb can be used to add new variables or change current variables.

selected_dataset %>% mutate(total_gdp_inMill = (gdpPercap * pop)/1000000) %>% arrange(desc(total_gdp_inMill))
## # A tibble: 142 x 7
##    country        continent  year lifeExp       pop gdpPercap total_gdp_inMill
##    <fct>          <fct>     <int>   <dbl>     <int>     <dbl>            <dbl>
##  1 United States  Americas   1977    73.4 220239000    24073.         5301732.
##  2 Japan          Asia       1977    75.4 113872473    16610.         1891465.
##  3 Germany        Europe     1977    72.5  78160773    20513.         1603306.
##  4 United Kingdom Europe     1977    72.8  56179000    17429.          979130.
##  5 France         Europe     1977    73.8  53165019    18293.          972528.
##  6 Italy          Europe     1977    73.5  56059245    14256.          799180.
##  7 Brazil         Americas   1977    61.5 114313951     6660.          761344.
##  8 China          Asia       1977    64.0 943455000      741.          699324.
##  9 Canada         Americas   1977    74.2  23796400    22091.          525683.
## 10 India          Asia       1977    54.2 634000000      813.          515656.
## # … with 132 more rows

8.2 Aggregating Data

The count verb() can be used to count the number of observations. We can use count and sort together to count and then arrange. We can then use the wt arguement to wight it according to a certain variable.

selected_dataset %>% count(continent, sort = TRUE, wt = mean(lifeExp))
## # A tibble: 5 x 2
##   continent     n
##   <fct>     <dbl>
## 1 Oceania    72.9
## 2 Europe     71.9
## 3 Americas   64.4
## 4 Asia       59.6
## 5 Africa     49.6

Group by() and Summarize ():

We can use group by and summarize functions to group by a certain variable and then summarize from that group.

selected_dataset %>% 
  group_by(country) %>% 
  summarize(mean_lifeExp = mean(lifeExp),
            real_gdp_Bill = gdpPercap * pop/1000000000) %>% 
  arrange(desc(real_gdp_Bill))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 142 x 3
##    country        mean_lifeExp real_gdp_Bill
##    <fct>                 <dbl>         <dbl>
##  1 United States          73.4         5302.
##  2 Japan                  75.4         1891.
##  3 Germany                72.5         1603.
##  4 United Kingdom         72.8          979.
##  5 France                 73.8          973.
##  6 Italy                  73.5          799.
##  7 Brazil                 61.5          761.
##  8 China                  64.0          699.
##  9 Canada                 74.2          526.
## 10 India                  54.2          516.
## # … with 132 more rows

top_n verb

top_n is used to keep the most extreme observation from each group.

selected_dataset %>% group_by(continent) %>%
  top_n(1, lifeExp)
## # A tibble: 5 x 6
## # Groups:   continent [5]
##   country   continent  year lifeExp       pop gdpPercap
##   <fct>     <fct>     <int>   <dbl>     <int>     <dbl>
## 1 Reunion   Africa     1977    67.1    492095     4320.
## 2 Australia Oceania    1977    73.5  14074100    18334.
## 3 Canada    Americas   1977    74.2  23796400    22091.
## 4 Japan     Asia       1977    75.4 113872473    16610.
## 5 Iceland   Europe     1977    76.1    221823    19655.

8.3 Selecting and Transforming Data

We can use select() to select a range of columns or subtract a column.

We can use the contains arguement within select to get columns that contain a certain string. Other select helpers include starts_with(), ends_with(), last_col().

We can use the ?select_helper for more helpers.

selected_dataset %>% select(country, -pop, starts_with("con"), ends_with("ar"))
## # A tibble: 142 x 3
##    country       continent  year
##    <fct>         <fct>     <int>
##  1 Cambodia      Asia       1977
##  2 Sierra Leone  Africa     1977
##  3 Guinea-Bissau Africa     1977
##  4 Afghanistan   Asia       1977
##  5 Angola        Africa     1977
##  6 Guinea        Africa     1977
##  7 Niger         Africa     1977
##  8 Mali          Africa     1977
##  9 Gambia        Africa     1977
## 10 Somalia       Africa     1977
## # … with 132 more rows

Renaming :

rename() function can be used to rename a variable

selected_dataset %>% rename(population = pop)
## # A tibble: 142 x 6
##    country       continent  year lifeExp population gdpPercap
##    <fct>         <fct>     <int>   <dbl>      <int>     <dbl>
##  1 Cambodia      Asia       1977    31.2    6978607      525.
##  2 Sierra Leone  Africa     1977    36.8    3140897     1348.
##  3 Guinea-Bissau Africa     1977    37.5     745228      765.
##  4 Afghanistan   Asia       1977    38.4   14880372      786.
##  5 Angola        Africa     1977    39.5    6162675     3009.
##  6 Guinea        Africa     1977    40.8    4227026      875.
##  7 Niger         Africa     1977    41.3    5682086      809.
##  8 Mali          Africa     1977    41.7    6491649      686.
##  9 Gambia        Africa     1977    41.8     608274      885.
## 10 Somalia       Africa     1977    42.0    4353666     1451.
## # … with 132 more rows

Transmute verb:

Transmute is a combination of select and mutate. It returns a subset of columns that are transformed and changed.

selected_dataset %>%
  transmute(country, continent, pop_inMillion = pop/1000000)
## # A tibble: 142 x 3
##    country       continent pop_inMillion
##    <fct>         <fct>             <dbl>
##  1 Cambodia      Asia              6.98 
##  2 Sierra Leone  Africa            3.14 
##  3 Guinea-Bissau Africa            0.745
##  4 Afghanistan   Asia             14.9  
##  5 Angola        Africa            6.16 
##  6 Guinea        Africa            4.23 
##  7 Niger         Africa            5.68 
##  8 Mali          Africa            6.49 
##  9 Gambia        Africa            0.608
## 10 Somalia       Africa            4.35 
## # … with 132 more rows

8.4 Case Study: The babynames Dataset

We can filter for multiple observations by using %in% .

selected_dataset %>% filter(country %in% c("Gambia", "Somalia"))
## # A tibble: 2 x 6
##   country continent  year lifeExp     pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>   <int>     <dbl>
## 1 Gambia  Africa     1977    41.8  608274      885.
## 2 Somalia Africa     1977    42.0 4353666     1451.

Grouped Mutates:

selected_dataset %>% 
  group_by(continent) %>%
  mutate(world_lifeExp_mean = mean(lifeExp))
## # A tibble: 142 x 7
## # Groups:   continent [5]
##    country       continent  year lifeExp      pop gdpPercap world_lifeExp_mean
##    <fct>         <fct>     <int>   <dbl>    <int>     <dbl>              <dbl>
##  1 Cambodia      Asia       1977    31.2  6978607      525.               59.6
##  2 Sierra Leone  Africa     1977    36.8  3140897     1348.               49.6
##  3 Guinea-Bissau Africa     1977    37.5   745228      765.               49.6
##  4 Afghanistan   Asia       1977    38.4 14880372      786.               59.6
##  5 Angola        Africa     1977    39.5  6162675     3009.               49.6
##  6 Guinea        Africa     1977    40.8  4227026      875.               49.6
##  7 Niger         Africa     1977    41.3  5682086      809.               49.6
##  8 Mali          Africa     1977    41.7  6491649      686.               49.6
##  9 Gambia        Africa     1977    41.8   608274      885.               49.6
## 10 Somalia       Africa     1977    42.0  4353666     1451.               49.6
## # … with 132 more rows

Window Function:

A window function takes a vector and returns another vector of the same length. We can use the lag() function.

v <- c(1,3,5,14)
lag(v)
## [1] NA  1  3  5
gapminder %>% 
  select(country, year, lifeExp) %>%
  filter(country == "Bangladesh") %>%
  arrange(year) %>%
  mutate (difference_in_lifeExp = lifeExp - lag(lifeExp)) %>%
  arrange(desc(difference_in_lifeExp))
## # A tibble: 12 x 4
##    country     year lifeExp difference_in_lifeExp
##    <fct>      <int>   <dbl>                 <dbl>
##  1 Bangladesh  1997    59.4                  3.39
##  2 Bangladesh  1992    56.0                  3.20
##  3 Bangladesh  1982    50.0                  3.09
##  4 Bangladesh  1987    52.8                  2.81
##  5 Bangladesh  2002    62.0                  2.60
##  6 Bangladesh  1967    43.5                  2.24
##  7 Bangladesh  2007    64.1                  2.05
##  8 Bangladesh  1962    41.2                  1.87
##  9 Bangladesh  1957    39.3                  1.86
## 10 Bangladesh  1972    45.3                  1.80
## 11 Bangladesh  1977    46.9                  1.67
## 12 Bangladesh  1952    37.5                 NA