dplyr basics

Pliers

This blog post demonstrates the usage of the R package dplyr. It turns out that dplyr is intuitive to the point where I probably won’t ever need to look back at this summary. A nice and very concise dplyr and tidyr cheat sheet is available from RStudio.

The following was compiled in rmarkdown [ download .Rmd file]

library(dplyr)
library(gapminder)

The gapminder data will be used for demonstration purposes.

class(gapminder)

## [1] "tbl_df"     "tbl"        "data.frame"

glimpse – a better str

glimpse(gapminder)

## Observations: 1,704
## Variables: 6
## $ country   <fctr> Afghanistan, Afghanistan, Afghanistan, Afghani...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia...
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987,...
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438,...
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460,...
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.981...

dplyr verbs

sample_frac – sample a given percentage of rows

sample_frac(gapminder, 0.5)

## # A tibble: 852 × 6
##        country continent  year lifeExp      pop gdpPercap
##         <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>
## 1       Taiwan      Asia  1977  70.590 16785196  5596.520
## 2  Puerto Rico  Americas  1972  72.160  2847132  9123.042
## 3      Croatia    Europe  1957  64.770  3991242  4338.232
## 4       Panama  Americas  2002  74.712  2990875  7356.032
## 5       Canada  Americas  1992  77.950 28523502 26342.884
## 6       Poland    Europe  1982  71.320 36227381  8451.531
## 7  Puerto Rico  Americas  1957  68.540  2260000  3907.156
## 8        Chile  Americas  1982  70.565 11487112  5095.666
## 9      Belgium    Europe  1957  69.240  8989111  9714.961
## 10     Myanmar      Asia  1967  49.379 25870271   349.000
## # ... with 842 more rows

sample_n – sample n rows

set.seed(2016)
tiny <- sample_n(gapminder, 3)
tiny

## # A tibble: 3 × 6
##     country continent  year lifeExp      pop gdpPercap
##      <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>
## 1  Colombia  Americas  1982  66.653 27764644  4397.576
## 2    Canada  Americas  1967  72.130 20819767 16076.588
## 3 Sri Lanka      Asia  1972  65.042 13016733  1213.396

rename – rename columns

rename(tiny, GDP = gdpPercap, population = pop)

## # A tibble: 3 × 6
##     country continent  year lifeExp population       GDP
##      <fctr>    <fctr> <int>   <dbl>      <int>     <dbl>
## 1  Colombia  Americas  1982  66.653   27764644  4397.576
## 2    Canada  Americas  1967  72.130   20819767 16076.588
## 3 Sri Lanka      Asia  1972  65.042   13016733  1213.396

select – select columns from the data frame

select(tiny, starts_with("y"), pop, matches("^co.*"))

## # A tibble: 3 × 4
##    year      pop   country continent
##   <int>    <int>    <fctr>    <fctr>
## 1  1982 27764644  Colombia  Americas
## 2  1967 20819767    Canada  Americas
## 3  1972 13016733 Sri Lanka      Asia

filter – select rows from the data frame, producing a subset

# filter(tiny, lifeExp > 60 & year < 1980)
# or equivalent:
filter(tiny, lifeExp > 60, year < 1980)

## # A tibble: 2 × 6
##     country continent  year lifeExp      pop gdpPercap
##      <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>
## 1    Canada  Americas  1967  72.130 20819767 16076.588
## 2 Sri Lanka      Asia  1972  65.042 13016733  1213.396

slice – select rows from data frame by index, producing a subset

slice(gapminder, 300:303)

## # A tibble: 4 × 6
##    country continent  year lifeExp        pop gdpPercap
##     <fctr>    <fctr> <int>   <dbl>      <int>     <dbl>
## 1    China      Asia  2007  72.961 1318683096  4959.115
## 2 Colombia  Americas  1952  50.643   12350771  2144.115
## 3 Colombia  Americas  1957  55.118   14485993  2323.806
## 4 Colombia  Americas  1962  57.863   17009885  2492.351

mutate – add new columns that can be functions of existing columns

mutate(tiny, newVar = (lifeExp / gdpPercap), newcol = 3:1)

## # A tibble: 3 × 8
##     country continent  year lifeExp      pop gdpPercap      newVar
##      <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>       <dbl>
## 1  Colombia  Americas  1982  66.653 27764644  4397.576 0.015156760
## 2    Canada  Americas  1967  72.130 20819767 16076.588 0.004486649
## 3 Sri Lanka      Asia  1972  65.042 13016733  1213.396 0.053603296
## # ... with 1 more variables: newcol <int>

tiny <- mutate(tiny, newVar = (lifeExp / gdpPercap), newVar2 = newVar^2)
glimpse(tiny)

## Observations: 3
## Variables: 8
## $ country   <fctr> Colombia, Canada, Sri Lanka
## $ continent <fctr> Americas, Americas, Asia
## $ year      <int> 1982, 1967, 1972
## $ lifeExp   <dbl> 66.653, 72.130, 65.042
## $ pop       <int> 27764644, 20819767, 13016733
## $ gdpPercap <dbl> 4397.576, 16076.588, 1213.396
## $ newVar    <dbl> 0.015156760, 0.004486649, 0.053603296
## $ newVar2   <dbl> 2.297274e-04, 2.013002e-05, 2.873313e-03

transmute – add new columns that can be functions of the existing columns, and drop the existing columns

tiny <- transmute(tiny, id = 1:3, country, continent,
                  newVarSqrt = sqrt(newVar), pop)
tiny

## # A tibble: 3 × 5
##      id   country continent newVarSqrt      pop
##   <int>    <fctr>    <fctr>      <dbl>    <int>
## 1     1  Colombia  Americas 0.12311279 27764644
## 2     2    Canada  Americas 0.06698245 20819767
## 3     3 Sri Lanka      Asia 0.23152386 13016733

arrange – reorder rows

arrange(tiny, pop)

## # A tibble: 3 × 5
##      id   country continent newVarSqrt      pop
##   <int>    <fctr>    <fctr>      <dbl>    <int>
## 1     3 Sri Lanka      Asia 0.23152386 13016733
## 2     2    Canada  Americas 0.06698245 20819767
## 3     1  Colombia  Americas 0.12311279 27764644

arrange(gapminder, desc(year), lifeExp)

## # A tibble: 1,704 × 6
##                     country continent  year lifeExp      pop
##                      <fctr>    <fctr> <int>   <dbl>    <int>
## 1                 Swaziland    Africa  2007  39.613  1133066
## 2                Mozambique    Africa  2007  42.082 19951656
## 3                    Zambia    Africa  2007  42.384 11746035
## 4              Sierra Leone    Africa  2007  42.568  6144562
## 5                   Lesotho    Africa  2007  42.592  2012649
## 6                    Angola    Africa  2007  42.731 12420476
## 7                  Zimbabwe    Africa  2007  43.487 12311143
## 8               Afghanistan      Asia  2007  43.828 31889923
## 9  Central African Republic    Africa  2007  44.741  4369038
## 10                  Liberia    Africa  2007  45.678  3193942
## # ... with 1,694 more rows, and 1 more variables: gdpPercap <dbl>

summarize – create collapsed summaries of a data frame by applying functions to columns

summarize(gapminder, aveLife = mean(lifeExp))

## # A tibble: 1 × 1
##    aveLife
##      <dbl>
## 1 59.47444

distinct – find distinct rows, for repetitive data

tiny2 <- tiny[c(1,1,2,2), ]
dim(tiny2)

## [1] 4 5

distinct(tiny2)

## # A tibble: 2 × 5
##      id  country continent newVarSqrt      pop
##   <int>   <fctr>    <fctr>      <dbl>    <int>
## 1     1 Colombia  Americas 0.12311279 27764644
## 2     2   Canada  Americas 0.06698245 20819767

n_distinct(tiny2)

## [1] 2

Chaining

Base-R-style

set.seed(2016)
sample_n(filter(gapminder, continent == "Asia" & lifeExp < 65), 2)

## # A tibble: 2 × 6
##    country continent  year  lifeExp       pop gdpPercap
##     <fctr>    <fctr> <int>    <dbl>     <int>     <dbl>
## 1    China      Asia  1957 50.54896 637408000   575.987
## 2 Cambodia      Asia  1972 40.31700   7450606   421.624

Using pipes

set.seed(2016)
gapminder %>% filter(continent == "Asia") %>% filter(lifeExp < 65) %>% sample_n(2)

## # A tibble: 2 × 6
##    country continent  year  lifeExp       pop gdpPercap
##     <fctr>    <fctr> <int>    <dbl>     <int>     <dbl>
## 1    China      Asia  1957 50.54896 637408000   575.987
## 2 Cambodia      Asia  1972 40.31700   7450606   421.624

More verbs

group_by – convert the data frame into a grouped data frame, where the operations are performed by group

gapminder %>% group_by(continent) %>%
  summarize(aveLife = mean(lifeExp), count = n(),
            countries = n_distinct(country))

## # A tibble: 5 × 4
##   continent  aveLife count countries
##      <fctr>    <dbl> <int>     <int>
## 1    Africa 48.86533   624        52
## 2  Americas 64.65874   300        25
## 3      Asia 60.06490   396        33
## 4    Europe 71.90369   360        30
## 5   Oceania 74.32621    24         2

gapminder %>% group_by(continent) %>% tally

## # A tibble: 5 × 2
##   continent     n
##      <fctr> <int>
## 1    Africa   624
## 2  Americas   300
## 3      Asia   396
## 4    Europe   360
## 5   Oceania    24

Join multiple data frames

Example originally from http://stat545.com/bit001_dplyr-cheatsheet.html

superheroes <- c("name, alignment, gender, publisher",
                 "Magneto, bad, male, Marvel",
                 "Storm, good, female, Marvel",
                 "Mystique, bad, female, Marvel",
                 "Batman, good, male, DC",
                 "Joker, bad, male, DC",
                 "Catwoman, bad, female, DC",
                 "Hellboy, good, male, Dark Horse Comics")
superheroes <- read.csv(text = superheroes, strip.white = TRUE, as.is=TRUE)
publishers <- c("publisher, yr_founded",
                "       DC, 1934",
                "   Marvel, 1939",
                "    Image, 1992")
publishers <- read.csv(text = publishers, strip.white = TRUE, as.is=TRUE)

Inner vs left vs full join

inner_join

inner_join(superheroes, publishers)

## Joining, by = "publisher"

##       name alignment gender publisher yr_founded
## 1  Magneto       bad   male    Marvel       1939
## 2    Storm      good female    Marvel       1939
## 3 Mystique       bad female    Marvel       1939
## 4   Batman      good   male        DC       1934
## 5    Joker       bad   male        DC       1934
## 6 Catwoman       bad female        DC       1934

left_join

left_join(superheroes, publishers)

## Joining, by = "publisher"

##       name alignment gender         publisher yr_founded
## 1  Magneto       bad   male            Marvel       1939
## 2    Storm      good female            Marvel       1939
## 3 Mystique       bad female            Marvel       1939
## 4   Batman      good   male                DC       1934
## 5    Joker       bad   male                DC       1934
## 6 Catwoman       bad female                DC       1934
## 7  Hellboy      good   male Dark Horse Comics         NA

full_join

full_join(superheroes, publishers)

## Joining, by = "publisher"

##       name alignment gender         publisher yr_founded
## 1  Magneto       bad   male            Marvel       1939
## 2    Storm      good female            Marvel       1939
## 3 Mystique       bad female            Marvel       1939
## 4   Batman      good   male                DC       1934
## 5    Joker       bad   male                DC       1934
## 6 Catwoman       bad female                DC       1934
## 7  Hellboy      good   male Dark Horse Comics         NA
## 8     <NA>      <NA>   <NA>             Image       1992