Date last run: 23Mar2020
I saw that Jane Wall created some tidyr lessons on YouTube.
In this blog entry I try to ‘remember’ the things she teaches us and what I have picked up during these lessons.
Load the packages that will be used
HOQCutil::silent_library(c('dplyr','tidyr','tibble'))
Load two example tables and make them ‘longer’
The prefix tidyr::
is not necessary if the package tidyr
is loaded (as it is here) but I think it more readable to indicate what the source of the data is: otherwise table4a
comes just out of thin air. I also use the prefix to indicate where the function pivot_longer
comes from.
After converting both tables to ‘long’ format they are combined by full_join
in the default way: as indicated in the output (Joining, by = c("country", "year")
) by joining rows with equal ‘country’ and ‘year’.
(tabl4a=tidyr::table4a )
#> # A tibble: 3 x 3
#> country `1999` `2000`
#> * <chr> <int> <int>
#> 1 Afghanistan 745 2666
#> 2 Brazil 37737 80488
#> 3 China 212258 213766
tidy4a=tidyr::pivot_longer(tabl4a,names_to="year",
cols=c(`1999`,`2000`),values_to = "cases")
print(tidy4a)
#> # A tibble: 6 x 3
#> country year cases
#> <chr> <chr> <int>
#> 1 Afghanistan 1999 745
#> 2 Afghanistan 2000 2666
#> 3 Brazil 1999 37737
#> 4 Brazil 2000 80488
#> 5 China 1999 212258
#> 6 China 2000 213766
(tabl4b=tidyr::table4b)
#> # A tibble: 3 x 3
#> country `1999` `2000`
#> * <chr> <int> <int>
#> 1 Afghanistan 19987071 20595360
#> 2 Brazil 172006362 174504898
#> 3 China 1272915272 1280428583
tidy4b=tidyr::pivot_longer(tabl4b,names_to="year",
cols=c(`1999`,`2000`),values_to = "population")
print(tidy4b)
#> # A tibble: 6 x 3
#> country year population
#> <chr> <chr> <int>
#> 1 Afghanistan 1999 19987071
#> 2 Afghanistan 2000 20595360
#> 3 Brazil 1999 172006362
#> 4 Brazil 2000 174504898
#> 5 China 1999 1272915272
#> 6 China 2000 1280428583
dplyr::full_join(tidy4a,tidy4b)
#> Joining, by = c("country", "year")
#> # A tibble: 6 x 4
#> country year cases population
#> <chr> <chr> <int> <int>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
Load two example tables and make them ‘wider’
In the second example with fish_encounters
we use values_fill = c('seen'=0)
to return a zero in a cell that would otherwise be NA
because a certain fish was not spotted in a certain station.
(tabl2 = tidyr::table2 )
#> # A tibble: 12 x 4
#> country year type count
#> <chr> <int> <chr> <int>
#> 1 Afghanistan 1999 cases 745
#> 2 Afghanistan 1999 population 19987071
#> 3 Afghanistan 2000 cases 2666
#> 4 Afghanistan 2000 population 20595360
#> 5 Brazil 1999 cases 37737
#> 6 Brazil 1999 population 172006362
#> 7 Brazil 2000 cases 80488
#> 8 Brazil 2000 population 174504898
#> 9 China 1999 cases 212258
#> 10 China 1999 population 1272915272
#> 11 China 2000 cases 213766
#> 12 China 2000 population 1280428583
tidy2=tidyr::pivot_wider(tabl2,id_cols=c('country','year'),
names_from = type,values_from = count)
print(tidy2)
#> # A tibble: 6 x 4
#> country year cases population
#> <chr> <int> <int> <int>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
tidyr::pivot_wider(fish_encounters,id_cols=c('fish'),
names_from = station,values_from = seen,
values_fill = c('seen'=0) )
#> # A tibble: 19 x 12
#> fish Release I80_1 Lisbon Rstr Base_TD BCE BCW BCE2 BCW2 MAE MAW
#> <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 4842 1 1 1 1 1 1 1 1 1 1 1
#> 2 4843 1 1 1 1 1 1 1 1 1 1 1
#> 3 4844 1 1 1 1 1 1 1 1 1 1 1
#> 4 4845 1 1 1 1 1 0 0 0 0 0 0
#> 5 4847 1 1 1 0 0 0 0 0 0 0 0
#> 6 4848 1 1 1 1 0 0 0 0 0 0 0
#> 7 4849 1 1 0 0 0 0 0 0 0 0 0
#> 8 4850 1 1 0 1 1 1 1 0 0 0 0
#> 9 4851 1 1 0 0 0 0 0 0 0 0 0
#> 10 4854 1 1 0 0 0 0 0 0 0 0 0
#> 11 4855 1 1 1 1 1 0 0 0 0 0 0
#> 12 4857 1 1 1 1 1 1 1 1 1 0 0
#> 13 4858 1 1 1 1 1 1 1 1 1 1 1
#> 14 4859 1 1 1 1 1 0 0 0 0 0 0
#> 15 4861 1 1 1 1 1 1 1 1 1 1 1
#> 16 4862 1 1 1 1 1 1 1 1 1 0 0
#> 17 4863 1 1 0 0 0 0 0 0 0 0 0
#> 18 4864 1 1 0 0 0 0 0 0 0 0 0
#> 19 4865 1 1 1 0 0 0 0 0 0 0 0
Load table and make wider with a ‘spec’
Working with specw
as originally specified works fine.
However when I restrict it with a filter it will not work unless a summary function is used.
This could work as designed. See RStudio Community question .
specw = tidyr::build_wider_spec(fish_encounters,
names_from = station,values_from = seen
)
specw = specw %>%
filter (!.name %in% c('Release','Lisbon') )
tidyr::pivot_wider_spec(fish_encounters,specw,id_cols=c('fish'),
values_fill = c('seen'=0L)
,values_fn = list(seen = max)
)
#> # A tibble: 19 x 10
#> fish I80_1 Rstr Base_TD BCE BCW BCE2 BCW2 MAE MAW
#> <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 4842 1 1 1 1 1 1 1 1 1
#> 2 4843 1 1 1 1 1 1 1 1 1
#> 3 4844 1 1 1 1 1 1 1 1 1
#> 4 4845 1 1 1 0 0 0 0 0 0
#> 5 4847 1 0 0 0 0 0 0 0 0
#> 6 4848 1 1 0 0 0 0 0 0 0
#> 7 4849 1 0 0 0 0 0 0 0 0
#> 8 4850 1 1 1 1 1 0 0 0 0
#> 9 4851 1 0 0 0 0 0 0 0 0
#> 10 4854 1 0 0 0 0 0 0 0 0
#> 11 4855 1 1 1 0 0 0 0 0 0
#> 12 4857 1 1 1 1 1 1 1 0 0
#> 13 4858 1 1 1 1 1 1 1 1 1
#> 14 4859 1 1 1 0 0 0 0 0 0
#> 15 4861 1 1 1 1 1 1 1 1 1
#> 16 4862 1 1 1 1 1 1 1 0 0
#> 17 4863 1 0 0 0 0 0 0 0 0
#> 18 4864 1 0 0 0 0 0 0 0 0
#> 19 4865 1 0 0 0 0 0 0 0 0
Separate and unite
In the ‘separate’ example we use convert=T
so that the character components of rate
are converted to integers.
In the ‘unite’ example we see that the result is a character (the result of with(tidy3a,paste(century,year,sep=''))
). To get the answer we want, we have to format the year with two characters, do the ‘unite’ and then convert to the character result to numeric.
(tabl3 = tidyr::table3)
#> # A tibble: 6 x 3
#> country year rate
#> * <chr> <int> <chr>
#> 1 Afghanistan 1999 745/19987071
#> 2 Afghanistan 2000 2666/20595360
#> 3 Brazil 1999 37737/172006362
#> 4 Brazil 2000 80488/174504898
#> 5 China 1999 212258/1272915272
#> 6 China 2000 213766/1280428583
tidy3a = tabl3 %>%
tidyr::separate(col=rate,into=c('cases','population'),sep='/',convert=T) %>%
tidyr::separate(col=year,into=c('century','year'),sep=2,convert=T)
print(tidy3a)
#> # A tibble: 6 x 5
#> country century year cases population
#> <chr> <int> <int> <int> <int>
#> 1 Afghanistan 19 99 745 19987071
#> 2 Afghanistan 20 0 2666 20595360
#> 3 Brazil 19 99 37737 172006362
#> 4 Brazil 20 0 80488 174504898
#> 5 China 19 99 212258 1272915272
#> 6 China 20 0 213766 1280428583
tabl3a = tidy3a %>%
tidyr::unite(col='year',century,year,sep='',remove=T)
head(tabl3a)
#> # A tibble: 6 x 4
#> country year cases population
#> <chr> <chr> <int> <int>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 200 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 200 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 200 213766 1280428583
# table3a not as expected and therefore
tabl3b = tidy3a %>%
mutate(year=sprintf("%02.0f",year)) %>%
tidyr::unite(col='year',century,year,sep='',remove=T) %>%
mutate(year=as.numeric(year))
head(tabl3b)
#> # A tibble: 6 x 4
#> country year cases population
#> <chr> <dbl> <int> <int>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
Missing values: complete, fill
fill
and replace_na
only handle explicit NA
-s.
stocks <- tibble::tibble(
year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr = c(1, 2, 3, 4, 2, 3, 4),
return = c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
)
print(stocks)
#> # A tibble: 7 x 3
#> year qtr return
#> <dbl> <dbl> <dbl>
#> 1 2015 1 1.88
#> 2 2015 2 0.59
#> 3 2015 3 0.35
#> 4 2015 4 NA
#> 5 2016 2 0.92
#> 6 2016 3 0.17
#> 7 2016 4 2.66
stocks %>%
tidyr::complete(year,qtr)
#> # A tibble: 8 x 3
#> year qtr return
#> <dbl> <dbl> <dbl>
#> 1 2015 1 1.88
#> 2 2015 2 0.59
#> 3 2015 3 0.35
#> 4 2015 4 NA
#> 5 2016 1 NA
#> 6 2016 2 0.92
#> 7 2016 3 0.17
#> 8 2016 4 2.66
stocks %>%
tidyr::complete(year,qtr) %>%
tidyr::fill(return,.direction='down') # or 'up' or 'updown' or 'downup'
#> # A tibble: 8 x 3
#> year qtr return
#> <dbl> <dbl> <dbl>
#> 1 2015 1 1.88
#> 2 2015 2 0.59
#> 3 2015 3 0.35
#> 4 2015 4 0.35
#> 5 2016 1 0.35
#> 6 2016 2 0.92
#> 7 2016 3 0.17
#> 8 2016 4 2.66
df <- tibble::tibble(
group = c(1, 2, 1),
item_id = c(1, 2, 2),
item_name = c('a', 'b', 'b'),
value1 = 1:3,
value2 = 4:6
)
print(df)
#> # A tibble: 3 x 5
#> group item_id item_name value1 value2
#> <dbl> <dbl> <chr> <int> <int>
#> 1 1 1 a 1 4
#> 2 2 2 b 2 5
#> 3 1 2 b 3 6
df %>%
tidyr::complete(group,nesting(item_id,item_name),
fill=list(value1=0,value2='unknown'))
#> # A tibble: 4 x 5
#> group item_id item_name value1 value2
#> <dbl> <dbl> <chr> <dbl> <chr>
#> 1 1 1 a 1 4
#> 2 1 2 b 3 6
#> 3 2 1 a 0 unknown
#> 4 2 2 b 2 5
df %>%
tidyr::complete(group,nesting(item_id,item_name)) %>%
tidyr::replace_na(list(value1=0,value2='unknown'))
#> # A tibble: 4 x 5
#> group item_id item_name value1 value2
#> <dbl> <dbl> <chr> <dbl> <chr>
#> 1 1 1 a 1 4
#> 2 1 2 b 3 6
#> 3 2 1 a 0 unknown
#> 4 2 2 b 2 5
stocks %>%
tidyr::pivot_wider(
names_from = year,
values_from = return
) %>%
tidyr::pivot_longer(
cols=c(`2015`, `2016`) ,
names_to = 'year',
values_to = 'return',
values_drop_na = T
)
#> # A tibble: 6 x 3
#> qtr year return
#> <dbl> <chr> <dbl>
#> 1 1 2015 1.88
#> 2 2 2015 0.59
#> 3 2 2016 0.92
#> 4 3 2015 0.35
#> 5 3 2016 0.17
#> 6 4 2016 2.66
tidyr names_pattern
Use names_pattern
where the column names contain data.
preg <- tibble::tibble(
name = c('John Smith','Jane Doe','Mary Johnson'),
treatmenta = c(NA, 4, 6),
treatmentb = c(18, 1, 7)
)
print(preg)
#> # A tibble: 3 x 3
#> name treatmenta treatmentb
#> <chr> <dbl> <dbl>
#> 1 John Smith NA 18
#> 2 Jane Doe 4 1
#> 3 Mary Johnson 6 7
preg %>%
tidyr::pivot_longer(
cols = starts_with("treatment"),
names_to = "treatment",
values_to = "result",
names_pattern = "treatment(.)"
)
#> # A tibble: 6 x 3
#> name treatment result
#> <chr> <chr> <dbl>
#> 1 John Smith a NA
#> 2 John Smith b 18
#> 3 Jane Doe a 4
#> 4 Jane Doe b 1
#> 5 Mary Johnson a 6
#> 6 Mary Johnson b 7
tidyr::who %>% # Data from World Health Organisation
tidyr::pivot_longer(
# cols = starts_with("new"), # my column selection
cols =new_sp_m014:newrel_f65,
names_to = c("diagnosis", "gender","agegroup"),
values_to = "count",
values_drop_na = T,
# names_pattern = "new_*(.+)_([fm])(.+)" # my regex
names_pattern = "new_?(.*)_(.)(.*)",
names_ptypes = list(
gender = factor(levels = c("f", "m")),
agegroup = factor (
levels = c("014", "1524", "2534", "3544", "4554", "5564","65"),
ordered = T
)
)
)
#> # A tibble: 76,046 x 8
#> country iso2 iso3 year diagnosis gender agegroup count
#> <chr> <chr> <chr> <int> <chr> <fct> <ord> <int>
#> 1 Afghanistan AF AFG 1997 sp m 014 0
#> 2 Afghanistan AF AFG 1997 sp m 1524 10
#> 3 Afghanistan AF AFG 1997 sp m 2534 6
#> 4 Afghanistan AF AFG 1997 sp m 3544 3
#> 5 Afghanistan AF AFG 1997 sp m 4554 5
#> 6 Afghanistan AF AFG 1997 sp m 5564 2
#> 7 Afghanistan AF AFG 1997 sp m 65 0
#> 8 Afghanistan AF AFG 1997 sp f 014 5
#> 9 Afghanistan AF AFG 1997 sp f 1524 38
#> 10 Afghanistan AF AFG 1997 sp f 2534 36
#> # ... with 76,036 more rows
weather = read.csv(
paste0("https://raw.githubusercontent.com/",
"JaneWall/STAT412_612_online/master/",
"Week%208%20tidyr/Data/weather.csv"))
# names(weather)
# tibble::glimpse(weather)
weather %>%
tidyr::pivot_longer(
cols =d1:d31,
names_to = "day",
values_to = "value",
values_drop_na = T,
names_pattern = "d(.*)"
)
#> # A tibble: 66 x 6
#> id year month element day value
#> <fct> <int> <int> <fct> <chr> <dbl>
#> 1 MX17004 2010 1 tmax 30 27.8
#> 2 MX17004 2010 1 tmin 30 14.5
#> 3 MX17004 2010 2 tmax 2 27.3
#> 4 MX17004 2010 2 tmax 3 24.1
#> 5 MX17004 2010 2 tmax 11 29.7
#> 6 MX17004 2010 2 tmax 23 29.9
#> 7 MX17004 2010 2 tmin 2 14.4
#> 8 MX17004 2010 2 tmin 3 14.4
#> 9 MX17004 2010 2 tmin 11 13.4
#> 10 MX17004 2010 2 tmin 23 10.7
#> # ... with 56 more rows
tidyr multiple obs per row and hand specs
tibble::tribble(
~ family, ~ dob_child1, ~ dob_child2, ~ gender_child1, ~ gender_child2,
1L, "1988-11-26", "2011-01-29", 1L, 2L,
2L, "1996-06-22", NA, 2L, NA
) %>%
mutate_at(vars(starts_with("dob")), lubridate::ymd) %>%
tidyr::pivot_longer(
cols = -family,
names_to = c(".value", "child"),
names_sep = "_",
values_drop_na = TRUE
)
#> # A tibble: 3 x 4
#> family child dob gender
#> <int> <chr> <date> <int>
#> 1 1 child1 1988-11-26 1
#> 2 1 child2 2011-01-29 2
#> 3 2 child1 1996-06-22 2
##
wbp <- world_bank_pop %>%
tidyr::pivot_longer(cols = `2000`:`2017`,
names_to = 'year',
values_to = 'value')
wbp1 <- wbp %>%
tidyr::pivot_wider(names_from = indicator,
values_from = 'value')
wbp2 <- wbp1 %>%
tidyr::pivot_longer(
cols = `SP.URB.TOTL`:`SP.POP.GROW`,
names_to = c("size", "type"),
names_pattern = "SP\\.(.+)\\.(.+)",
values_to = "value"
)
Session Info
This document was produced on 23Mar2020 with the following R environment:
#> R version 3.6.0 (2019-04-26)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 18363)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=English_United States.1252
#> [2] LC_CTYPE=English_United States.1252
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.1252
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] tibble_2.1.3 tidyr_1.0.2.9000 dplyr_0.8.5
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.4 knitr_1.28 magrittr_1.5 tidyselect_1.0.0
#> [5] HOQCutil_0.1.19 R6_2.4.1 rlang_0.4.5 fansi_0.4.1
#> [9] stringr_1.4.0 tools_3.6.0 xfun_0.10 utf8_1.1.4
#> [13] cli_2.0.2 ellipsis_0.3.0 htmltools_0.4.0 digest_0.6.25
#> [17] assertthat_0.2.1 lifecycle_0.2.0 crayon_1.3.4 purrr_0.3.3
#> [21] vctrs_0.2.4 glue_1.3.2 evaluate_0.14 rmarkdown_2.1
#> [25] stringi_1.4.6 compiler_3.6.0 pillar_1.4.3 lubridate_1.7.4
#> [29] pkgconfig_2.0.3