I have the following dataset,
my_df <- data.frame(KPI_ID=c('DAD0012012', 'DAD0012013', 'DAE0012012', 'DAE0012013',
'DAE0012014', 'DAE0012015', 'DAF0012012'),
year=c(2015, 2016, 2012, 2013, 2014, 2015, 2012),
code_id=c('DAD001', 'DAD001', 'DAE001', 'DAE001', 'DAE001', 'DAE001', 'DAF001'),
region=c('Europe and Central Asia', 'Europe and Central Asia',
'Middle East and North Africa', 'Middle East and North Africa',
'Middle East and North Africa', 'Middle East and North Africa',
'Asia Pacific'),
country=c('Andorra', 'Andorra', 'United Arab Emirates', 'United Arab Emirates',
'United Arab Emirates', 'United Arab Emirates', 'Afghanistan'),
supported1=c("", "DAE001,DAF001", "DAD001,DAF001", "DAF001", "", "DAF001,DAD001", "DAD001"),
received_support1=c("DAE001","","DAD001,DAF001","DAD001,DAF001","DAD001,DAF001","DAD001",
"DAD001,DAE001"))
Which contains "cells" with two or more elements separated by ",". I am trying to split them and add a row below accordingly (see below desired df).
desired_df <- data.frame(KPI_ID=c('DAD0012012', 'DAD0012013', 'DAD0012013',
'DAE0012012','DAE0012012', 'DAE0012013','DAE0012013','DAE0012014','DAE0012014','DAE0012015','DAE0012015',
'DAE0012012','DAE0012012'),
year=c(2015, 2016,2016,
2012,2012, 2013,2013,2014,2014,2015,2015,
2012,2012),
code_id=c('DAD001', 'DAD001', 'DAD001',
'DAE001','DAE001', 'DAE001','DAE001','DAE001','DAE001','DAE001','DAE001',
"DAF001","DAF001"),
region=c('Europe and Central Asia', 'Europe and Central Asia','Europe and Central Asia',
'Middle East and North Africa','Middle East and North Africa','Middle East and North Africa', 'Middle East and North Africa','Middle East and North Africa','Middle East and North Africa', 'Middle East and North Africa', 'Middle East and North Africa',
'Asia Pacific','Asia Pacific'),
country=c('Andorra', 'Andorra', 'Andorra',
'United Arab Emirates', 'United Arab Emirates', 'United Arab Emirates','United Arab Emirates','United Arab Emirates','United Arab Emirates', 'United Arab Emirates', 'United Arab Emirates',
'Afghanistan','Afghanistan'),
supported1=c("", "DAE001,DAF001", "DAE001,DAF001",
"DAD001,DAF001", "DAD001,DAF001","DAF001","DAF001","","","DAF001,DAD001","DAF001,DAD001",
"DAD001","DAD001"),
received_support1=c("DAE001","","",
"DAD001,DAF001","DAD001,DAF001","DAD001,DAF001","DAD001,DAF001","DAD001,DAF001","DAD001,DAF001","DAD001","DAD001",
"DAD001,DAE001","DAD001,DAE001"),
support_to=c("","United Arab Emirates","Afghanistan",
"Andorra","Afghanistan","Afghanistan","","","","Afghanistan","Andorra",
"Andorra",""),
received_support_from=c("United Arab Emirates","","",
"Andorra","Afghanistan","Andorra","Afghanistan","Andorra","Afghanistan","Andorra","",
"Andorra","United Arab Emirates"))
CodePudding user response:
You can try
library(tidyverse)
separate_rows(my_df, supported1, received_support1, sep=",") %>%
left_join(distinct(my_df, code_id, country), by=c("supported1" = "code_id")) %>%
left_join(distinct(my_df, code_id, country), by=c("received_support1" = "code_id"))
CodePudding user response:
Using separate_rows
like @Roman mentioned in the comments:
separate_rows(my_df, supported1, received_support1, sep = "\\,")
Output:
# A tibble: 13 × 7
KPI_ID year code_id region country supported1 received_support1
<chr> <dbl> <chr> <chr> <chr> <chr> <chr>
1 DAD0012012 2015 DAD001 Europe and Central Asia Andorra "" "DAE001"
2 DAD0012013 2016 DAD001 Europe and Central Asia Andorra "DAE001" ""
3 DAD0012013 2016 DAD001 Europe and Central Asia Andorra "DAF001" ""
4 DAE0012012 2012 DAE001 Middle East and North Africa United Arab Emirates "DAD001" "DAD001"
5 DAE0012012 2012 DAE001 Middle East and North Africa United Arab Emirates "DAF001" "DAF001"
6 DAE0012013 2013 DAE001 Middle East and North Africa United Arab Emirates "DAF001" "DAD001"
7 DAE0012013 2013 DAE001 Middle East and North Africa United Arab Emirates "DAF001" "DAF001"
8 DAE0012014 2014 DAE001 Middle East and North Africa United Arab Emirates "" "DAD001"
9 DAE0012014 2014 DAE001 Middle East and North Africa United Arab Emirates "" "DAF001"
10 DAE0012015 2015 DAE001 Middle East and North Africa United Arab Emirates "DAF001" "DAD001"
11 DAE0012015 2015 DAE001 Middle East and North Africa United Arab Emirates "DAD001" "DAD001"
12 DAF0012012 2012 DAF001 Asia Pacific Afghanistan "DAD001" "DAD001"
13 DAF0012012 2012 DAF001 Asia Pacific Afghanistan "DAD001" "DAE001"
Old answer:
An option using twice separate_rows
:
library(tidyr)
library(dplyr)
my_df %>%
separate_rows(supported1, sep = "\\,") %>%
separate_rows(received_support1, sep = "\\,")