Welcome, friend :)

In this tutorial, I am going to present you the basic functions in the dplyr package: select(), filter(), mutate(), transmute(), group_by() and summarise(). If you have any doubts, don’t hesitate to contact me on samuelmacedo@recife.ifpe.edu.br.

Let’s get to action…first of all, to install dplyr, please use the commands below:

install.packages("dplyr")
library(dplyr)

Before I start: as_tibble()

Tibble is a modern reimagining of the data.frame. You don’t need to change your data.frame to tibble to use dplyr, but it is strongly recommended. For more information about its properties please check this out.

iris_tbl <- as_tibble(iris)

glimpse()

This function is similar to str().

# see the data frame struct
glimpse(iris_tbl)
## Observations: 150
## Variables: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,...
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5,...
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1,...
## $ Species      <fct> setosa, setosa, setosa, setosa, setosa, setosa, s...

select()

This function allows you to select the variables you want to work with.

# select just one variable
select(iris_tbl, Petal.Length) 
## # A tibble: 150 x 1
##    Petal.Length
##           <dbl>
##  1          1.4
##  2          1.4
##  3          1.3
##  4          1.5
##  5          1.4
##  6          1.7
##  7          1.4
##  8          1.5
##  9          1.4
## 10          1.5
## # ... with 140 more rows
# you can use commas to select one or more variables
select(iris_tbl, Petal.Length, Petal.Width)  
## # A tibble: 150 x 2
##    Petal.Length Petal.Width
##           <dbl>       <dbl>
##  1          1.4         0.2
##  2          1.4         0.2
##  3          1.3         0.2
##  4          1.5         0.2
##  5          1.4         0.2
##  6          1.7         0.4
##  7          1.4         0.3
##  8          1.5         0.2
##  9          1.4         0.2
## 10          1.5         0.1
## # ... with 140 more rows
# you can exclude variables using minus(-)
select(iris_tbl, -Species) 
## # A tibble: 150 x 4
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          5.1         3.5          1.4         0.2
##  2          4.9         3            1.4         0.2
##  3          4.7         3.2          1.3         0.2
##  4          4.6         3.1          1.5         0.2
##  5          5           3.6          1.4         0.2
##  6          5.4         3.9          1.7         0.4
##  7          4.6         3.4          1.4         0.3
##  8          5           3.4          1.5         0.2
##  9          4.4         2.9          1.4         0.2
## 10          4.9         3.1          1.5         0.1
## # ... with 140 more rows
# with pipe
iris_tbl %>% 
  select(Petal.Length, Petal.Width)
## # A tibble: 150 x 2
##    Petal.Length Petal.Width
##           <dbl>       <dbl>
##  1          1.4         0.2
##  2          1.4         0.2
##  3          1.3         0.2
##  4          1.5         0.2
##  5          1.4         0.2
##  6          1.7         0.4
##  7          1.4         0.3
##  8          1.5         0.2
##  9          1.4         0.2
## 10          1.5         0.1
## # ... with 140 more rows
# you can use select to rearrange the variables
iris_tbl %>% 
  select(Petal.Length, Sepal.Length, 
         Petal.Width, Sepal.Width)
## # A tibble: 150 x 4
##    Petal.Length Sepal.Length Petal.Width Sepal.Width
##           <dbl>        <dbl>       <dbl>       <dbl>
##  1          1.4          5.1         0.2         3.5
##  2          1.4          4.9         0.2         3  
##  3          1.3          4.7         0.2         3.2
##  4          1.5          4.6         0.2         3.1
##  5          1.4          5           0.2         3.6
##  6          1.7          5.4         0.4         3.9
##  7          1.4          4.6         0.3         3.4
##  8          1.5          5           0.2         3.4
##  9          1.4          4.4         0.2         2.9
## 10          1.5          4.9         0.1         3.1
## # ... with 140 more rows

filter()

This function returns only the rows that match some conditions. Let’s see examples!

# for one variable
iris_tbl %>% 
  filter(Sepal.Length > 5)
## Warning: package 'bindrcpp' was built under R version 3.4.4
## # A tibble: 118 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          5.4         3.9          1.7         0.4 setosa 
##  3          5.4         3.7          1.5         0.2 setosa 
##  4          5.8         4            1.2         0.2 setosa 
##  5          5.7         4.4          1.5         0.4 setosa 
##  6          5.4         3.9          1.3         0.4 setosa 
##  7          5.1         3.5          1.4         0.3 setosa 
##  8          5.7         3.8          1.7         0.3 setosa 
##  9          5.1         3.8          1.5         0.3 setosa 
## 10          5.4         3.4          1.7         0.2 setosa 
## # ... with 108 more rows
# for two or more variables you can use a comma (equivalent to the AND operator)
iris_tbl %>% 
  filter(Sepal.Length > 5,  Sepal.Width < 4)
## # A tibble: 114 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          5.4         3.9          1.7         0.4 setosa 
##  3          5.4         3.7          1.5         0.2 setosa 
##  4          5.4         3.9          1.3         0.4 setosa 
##  5          5.1         3.5          1.4         0.3 setosa 
##  6          5.7         3.8          1.7         0.3 setosa 
##  7          5.1         3.8          1.5         0.3 setosa 
##  8          5.4         3.4          1.7         0.2 setosa 
##  9          5.1         3.7          1.5         0.4 setosa 
## 10          5.1         3.3          1.7         0.5 setosa 
## # ... with 104 more rows
# you can filter by strings with "==" operator
iris_tbl %>% 
  filter(Species == "setosa")
## # A tibble: 50 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # ... with 40 more rows

mutate()

This function allows you to make basic operations in the variable, such as: summation, subtraction, multiplications and so on. Mutate can also create new variables in the data frame.

# you can transform a single variable
iris_tbl %>% 
  mutate(Sepal.Length = Sepal.Length * 2)
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1         10.2         3.5          1.4         0.2 setosa 
##  2          9.8         3            1.4         0.2 setosa 
##  3          9.4         3.2          1.3         0.2 setosa 
##  4          9.2         3.1          1.5         0.2 setosa 
##  5         10           3.6          1.4         0.2 setosa 
##  6         10.8         3.9          1.7         0.4 setosa 
##  7          9.2         3.4          1.4         0.3 setosa 
##  8         10           3.4          1.5         0.2 setosa 
##  9          8.8         2.9          1.4         0.2 setosa 
## 10          9.8         3.1          1.5         0.1 setosa 
## # ... with 140 more rows
# you can use commas for two or more trasformations
iris_tbl %>% 
  mutate(Sepal.Length = Sepal.Length * 2,
         Petal.Width = Petal.Width *10)
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1         10.2         3.5          1.4           2 setosa 
##  2          9.8         3            1.4           2 setosa 
##  3          9.4         3.2          1.3           2 setosa 
##  4          9.2         3.1          1.5           2 setosa 
##  5         10           3.6          1.4           2 setosa 
##  6         10.8         3.9          1.7           4 setosa 
##  7          9.2         3.4          1.4           3 setosa 
##  8         10           3.4          1.5           2 setosa 
##  9          8.8         2.9          1.4           2 setosa 
## 10          9.8         3.1          1.5           1 setosa 
## # ... with 140 more rows
# you can create any variables you want
iris_tbl %>% 
  mutate(Lengh_rate = Sepal.Length / Petal.Length,
         Width_rate = Sepal.Width / Petal.Width)
## # A tibble: 150 x 7
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species Lengh_rate
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>        <dbl>
##  1          5.1         3.5          1.4         0.2 setosa        3.64
##  2          4.9         3            1.4         0.2 setosa        3.5 
##  3          4.7         3.2          1.3         0.2 setosa        3.62
##  4          4.6         3.1          1.5         0.2 setosa        3.07
##  5          5           3.6          1.4         0.2 setosa        3.57
##  6          5.4         3.9          1.7         0.4 setosa        3.18
##  7          4.6         3.4          1.4         0.3 setosa        3.29
##  8          5           3.4          1.5         0.2 setosa        3.33
##  9          4.4         2.9          1.4         0.2 setosa        3.14
## 10          4.9         3.1          1.5         0.1 setosa        3.27
## # ... with 140 more rows, and 1 more variable: Width_rate <dbl>

working together: select(), filter(), and mutate().

iris_tbl %>% 
  select(Petal.Length, Petal.Width) %>% 
  filter(Petal.Length > 1.5) %>% 
  mutate(Petal_rate = Petal.Length / Petal.Width)
## # A tibble: 113 x 3
##    Petal.Length Petal.Width Petal_rate
##           <dbl>       <dbl>      <dbl>
##  1          1.7         0.4       4.25
##  2          1.6         0.2       8   
##  3          1.7         0.3       5.67
##  4          1.7         0.2       8.5 
##  5          1.7         0.5       3.4 
##  6          1.9         0.2       9.50
##  7          1.6         0.2       8   
##  8          1.6         0.4       4   
##  9          1.6         0.2       8   
## 10          1.6         0.2       8   
## # ... with 103 more rows

trasmute()

The transmute function also transforms or creates new variables, like mutate(). The difference betewen them is that transnmute() deletes all variables that it didn’t use. In other words, transmute() is like a mutate() plus select(). Let’s check it out!

# mutate() keeps all variables
iris_tbl %>% 
  mutate(Petal.Length = Petal.Length + 1)
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          2.4         0.2 setosa 
##  2          4.9         3            2.4         0.2 setosa 
##  3          4.7         3.2          2.3         0.2 setosa 
##  4          4.6         3.1          2.5         0.2 setosa 
##  5          5           3.6          2.4         0.2 setosa 
##  6          5.4         3.9          2.7         0.4 setosa 
##  7          4.6         3.4          2.4         0.3 setosa 
##  8          5           3.4          2.5         0.2 setosa 
##  9          4.4         2.9          2.4         0.2 setosa 
## 10          4.9         3.1          2.5         0.1 setosa 
## # ... with 140 more rows
# transmute() keeps only the variables used
iris_tbl %>% 
  transmute(Petal.Length = Petal.Length + 1)
## # A tibble: 150 x 1
##    Petal.Length
##           <dbl>
##  1          2.4
##  2          2.4
##  3          2.3
##  4          2.5
##  5          2.4
##  6          2.7
##  7          2.4
##  8          2.5
##  9          2.4
## 10          2.5
## # ... with 140 more rows
# you can create new variables
iris_tbl %>% 
  transmute(Petal_rate = Petal.Length / Petal.Width)
## # A tibble: 150 x 1
##    Petal_rate
##         <dbl>
##  1       7.00
##  2       7.00
##  3       6.5 
##  4       7.5 
##  5       7.00
##  6       4.25
##  7       4.67
##  8       7.5 
##  9       7.00
## 10      15   
## # ... with 140 more rows
# you can trasmumte more than one variable using commas. Notice that the order of the variables in the data.frame is the same as the order written in the code.
iris_tbl %>%
  transmute(Species,
            Petal_rate = Petal.Length / Petal.Width)
## # A tibble: 150 x 2
##    Species Petal_rate
##    <fct>        <dbl>
##  1 setosa        7.00
##  2 setosa        7.00
##  3 setosa        6.5 
##  4 setosa        7.5 
##  5 setosa        7.00
##  6 setosa        4.25
##  7 setosa        4.67
##  8 setosa        7.5 
##  9 setosa        7.00
## 10 setosa       15   
## # ... with 140 more rows

group_by() and summarise()

These two functions work together. The group_by() function only indexes the elements, while summarise() performs the operations. To know the all operations you can use inside summarise(), please look up ?summarise()

# count how many observations for each Species
iris_tbl %>% 
  group_by(Species) %>% 
  summarise(Qtd = n())
## # A tibble: 3 x 2
##   Species      Qtd
##   <fct>      <int>
## 1 setosa        50
## 2 versicolor    50
## 3 virginica     50
# mean of petal length for each species
iris_tbl %>% 
  group_by(Species) %>% 
  summarise(Petal.Lenght_mean = mean(Petal.Length))
## # A tibble: 3 x 2
##   Species    Petal.Lenght_mean
##   <fct>                  <dbl>
## 1 setosa                  1.46
## 2 versicolor              4.26
## 3 virginica               5.55

That’s all folks

Liked it? You can share this tutorial using the buttons below. If you still have any doubts or something to contribute, feel free to contact at samuelmacedo@recife.ifpe.edu.br.

See ya!