19 Wide-to-long tables

author: Юрій Клебан

Before start load packages

library(tidyr)

Some times your data is not in tidy format. Peole can collect data year by year in each column. It’s problem to use such data for feature engeniering and building prediction models. Let’s generate such data sample (quaterly salary of some people).

not_good_data <- data.frame(Name = c("Nick", "Jake", "Anna", "Jane", "Dina"),
                           q1_2021 = c(12442, 22131, 21343, 22111, 14123),
                           q2_2021 = c(13442, 22871, 20343, 22222, 14456),
                           q3_2021 = c(15482, 22031, 22456, 22444, 14533),
                           q4_2021 = c(14511, 20031, 21741, 22333, 14511))
not_good_data

A data.frame: 5 × 5
Name	q1_2021	q2_2021	q3_2021	q4_2021
<chr>	<dbl>	<dbl>	<dbl>	<dbl>
Nick	12442	13442	15482	14511
Jake	22131	22871	22031	20031
Anna	21343	20343	22456	21741
Jane	22111	22222	22444	22333
Dina	14123	14456	14533	14511

better_data <- not_good_data %>%
                gather(quater, salary, 2:5)
                # gather(quater, salary, q1_2021:q4_2021) possible code too
better_data

A data.frame: 20 × 3
Name	quater	salary
<chr>	<chr>	<dbl>
Nick	q1_2021	12442
Jake	q1_2021	22131
Anna	q1_2021	21343
Jane	q1_2021	22111
Dina	q1_2021	14123
Nick	q2_2021	13442
Jake	q2_2021	22871
Anna	q2_2021	20343
Jane	q2_2021	22222
Dina	q2_2021	14456
Nick	q3_2021	15482
Jake	q3_2021	22031
Anna	q3_2021	22456
Jane	q3_2021	22444
Dina	q3_2021	14533
Nick	q4_2021	14511
Jake	q4_2021	20031
Anna	q4_2021	21741
Jane	q4_2021	22333
Dina	q4_2021	14511

To make our data tidier separate() can split quater column into 2 (quater and year):

best_data <- better_data %>%
    separate(quater, c("quater", "year"), sep = "_") %>% # separate
    mutate(year = as.integer(year), # convert year to integer
           quater = substr(better_data$quater, 2,2), # trim `q` from start
           quater = as.integer(quater), # convert quater to integer
          ) %>%
    head(10)
best_data

A data.frame: 10 × 4
	Name	quater	year	salary
	<chr>	<int>	<int>	<dbl>
1	Nick	1	2021	12442
2	Jake	1	2021	22131
3	Anna	1	2021	21343
4	Jane	1	2021	22111
5	Dina	1	2021	14123
6	Nick	2	2021	13442
7	Jake	2	2021	22871
8	Anna	2	2021	20343
9	Jane	2	2021	22222
10	Dina	2	2021	14456

The unite() function concanates two columns into one:

united_data <- best_data %>%
                unite(Qt, quater, year, sep = "#")
united_data

A data.frame: 10 × 3
	Name	Qt	salary
	<chr>	<chr>	<dbl>
1	Nick	1#2021	12442
2	Jake	1#2021	22131
3	Anna	1#2021	21343
4	Jane	1#2021	22111
5	Dina	1#2021	14123
6	Nick	2#2021	13442
7	Jake	2#2021	22871
8	Anna	2#2021	20343
9	Jane	2#2021	22222
10	Dina	2#2021	14456

# if dont want remove old columns use remove param
united_data <- best_data %>%
                unite(Qt, quater, year, sep = "#", remove = F)
united_data

A data.frame: 10 × 5
	Name	Qt	quater	year	salary
	<chr>	<chr>	<int>	<int>	<dbl>
1	Nick	1#2021	1	2021	12442
2	Jake	1#2021	1	2021	22131
3	Anna	1#2021	1	2021	21343
4	Jane	1#2021	1	2021	22111
5	Dina	1#2021	1	2021	14123
6	Nick	2#2021	2	2021	13442
7	Jake	2#2021	2	2021	22871
8	Anna	2#2021	2	2021	20343
9	Jane	2#2021	2	2021	22222
10	Dina	2#2021	2	2021	14456

If you need to make table like initial use spread() function:

not_good_data2 <- better_data %>%
                    spread(quater, salary)
not_good_data2

A data.frame: 5 × 5
Name	q1_2021	q2_2021	q3_2021	q4_2021
<chr>	<dbl>	<dbl>	<dbl>	<dbl>
Anna	21343	20343	22456	21741
Dina	14123	14456	14533	14511
Jake	22131	22871	22031	20031
Jane	22111	22222	22444	22333
Nick	12442	13442	15482	14511

Let’s try to spread() feild pop of gapminder by year:

library(gapminder)

gapminder %>% select(country, pop, year) %>%
                spread(year, pop) %>%
                head() # for shorter code

# now you can easy send data to your director in excel :)

A tibble: 6 × 13
country	1952	1957	1962	1967	1972	1977	1982	1987	1992	1997	2002	2007
<fct>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
Afghanistan	8425333	9240934	10267083	11537966	13079460	14880372	12881816	13867957	16317921	22227415	25268405	31889923
Albania	1282697	1476505	1728137	1984060	2263554	2509048	2780097	3075321	3326498	3428038	3508512	3600523
Algeria	9279525	10270856	11000948	12760499	14760787	17152804	20033753	23254956	26298373	29072015	31287142	33333216
Angola	4232095	4561361	4826015	5247469	5894858	6162675	7016384	7874230	8735988	9875024	10866106	12420476
Argentina	17876956	19610538	21283783	22934225	24779799	26983828	29341374	31620918	33958947	36203463	38331121	40301927
Australia	8691212	9712569	10794968	11872264	13177000	14074100	15184200	16257249	17481977	18565243	19546792	20434176

Functions table:

Function	Objectives	Arguments
`gather()`	Transform the data from wide to long	(data, key, value, na.rm = FALSE)
`spread()`	Transform the data from long to wide	(data, key, value)
`separate()`	Split one variables into two	(data, col, into, sep= ““, remove = TRUE)
`unite()`	Unite two variables into one	(data, col, conc ,sep= ““, remove = TRUE)

19.1 Refences

dplyr: A Grammar of Data Manipulation on https://cran.r-project.org/.
Data Transformation with splyr::cheat sheet.
DPLYR TUTORIAL : DATA MANIPULATION (50 EXAMPLES) by Deepanshu Bhalla.
Dplyr Intro by Stat 545. 6.R Dplyr Tutorial: Data Manipulation(Join) & Cleaning(Spread). Introduction to Data Analysis
Loan Default Prediction. Beginners data set for financial analytics Kaggle