Home > Software engineering >  R separate split pivot
R separate split pivot

Time:07-05

I have the following dataset,

my_df <- data.frame(KPI_ID=c('DAD0012012', 'DAD0012013', 'DAE0012012', 'DAE0012013',
                             'DAE0012014', 'DAE0012015', 'DAF0012012'),
                    year=c(2015, 2016, 2012, 2013, 2014, 2015, 2012),
                    code_id=c('DAD001', 'DAD001', 'DAE001', 'DAE001', 'DAE001', 'DAE001', 'DAF001'),
                    region=c('Europe and Central Asia', 'Europe and Central Asia',
                             'Middle East and North Africa', 'Middle East and North Africa',
                             'Middle East and North Africa', 'Middle East and North Africa', 
                             'Asia Pacific'),
                    country=c('Andorra', 'Andorra', 'United Arab Emirates', 'United Arab Emirates',
                          'United Arab Emirates', 'United Arab Emirates', 'Afghanistan'),
                    supported1=c("", "DAE001,DAF001", "DAD001,DAF001", "DAF001", "", "DAF001,DAD001", "DAD001"),
                    received_support1=c("DAE001","","DAD001,DAF001","DAD001,DAF001","DAD001,DAF001","DAD001",
                                        "DAD001,DAE001"))

Which contains "cells" with two or more elements separated by ",". I am trying to split them and add a row below accordingly (see below desired df).

desired_df <- data.frame(KPI_ID=c('DAD0012012', 'DAD0012013', 'DAD0012013',
                             'DAE0012012','DAE0012012', 'DAE0012013','DAE0012013','DAE0012014','DAE0012014','DAE0012015','DAE0012015',
                             'DAE0012012','DAE0012012'),
                    year=c(2015, 2016,2016, 
                           2012,2012, 2013,2013,2014,2014,2015,2015,
                           2012,2012),
                    code_id=c('DAD001', 'DAD001', 'DAD001',
                              'DAE001','DAE001', 'DAE001','DAE001','DAE001','DAE001','DAE001','DAE001',
                              "DAF001","DAF001"),
                    region=c('Europe and Central Asia', 'Europe and Central Asia','Europe and Central Asia',
                             'Middle East and North Africa','Middle East and North Africa','Middle East and North Africa', 'Middle East and North Africa','Middle East and North Africa','Middle East and North Africa', 'Middle East and North Africa', 'Middle East and North Africa',
                             'Asia Pacific','Asia Pacific'),
                    country=c('Andorra', 'Andorra', 'Andorra', 
                              'United Arab Emirates', 'United Arab Emirates', 'United Arab Emirates','United Arab Emirates','United Arab Emirates','United Arab Emirates', 'United Arab Emirates', 'United Arab Emirates',
                              'Afghanistan','Afghanistan'),
                    supported1=c("", "DAE001,DAF001", "DAE001,DAF001", 
                                 "DAD001,DAF001", "DAD001,DAF001","DAF001","DAF001","","","DAF001,DAD001","DAF001,DAD001",
                                 "DAD001","DAD001"),
                    received_support1=c("DAE001","","",
                                        "DAD001,DAF001","DAD001,DAF001","DAD001,DAF001","DAD001,DAF001","DAD001,DAF001","DAD001,DAF001","DAD001","DAD001",
                                        "DAD001,DAE001","DAD001,DAE001"),
                    support_to=c("","United Arab Emirates","Afghanistan",
                                 "Andorra","Afghanistan","Afghanistan","","","","Afghanistan","Andorra",
                                 "Andorra",""),
                    received_support_from=c("United Arab Emirates","","",
                                            "Andorra","Afghanistan","Andorra","Afghanistan","Andorra","Afghanistan","Andorra","",
                                            "Andorra","United Arab Emirates"))

CodePudding user response:

You can try

library(tidyverse)
separate_rows(my_df, supported1, received_support1, sep=",") %>% 
  left_join(distinct(my_df, code_id, country), by=c("supported1" = "code_id")) %>% 
  left_join(distinct(my_df, code_id, country), by=c("received_support1" = "code_id"))

CodePudding user response:

Using separate_rows like @Roman mentioned in the comments:

separate_rows(my_df, supported1, received_support1, sep = "\\,")

Output:

# A tibble: 13 × 7
   KPI_ID      year code_id region                       country              supported1 received_support1
   <chr>      <dbl> <chr>   <chr>                        <chr>                <chr>      <chr>            
 1 DAD0012012  2015 DAD001  Europe and Central Asia      Andorra              ""         "DAE001"         
 2 DAD0012013  2016 DAD001  Europe and Central Asia      Andorra              "DAE001"   ""               
 3 DAD0012013  2016 DAD001  Europe and Central Asia      Andorra              "DAF001"   ""               
 4 DAE0012012  2012 DAE001  Middle East and North Africa United Arab Emirates "DAD001"   "DAD001"         
 5 DAE0012012  2012 DAE001  Middle East and North Africa United Arab Emirates "DAF001"   "DAF001"         
 6 DAE0012013  2013 DAE001  Middle East and North Africa United Arab Emirates "DAF001"   "DAD001"         
 7 DAE0012013  2013 DAE001  Middle East and North Africa United Arab Emirates "DAF001"   "DAF001"         
 8 DAE0012014  2014 DAE001  Middle East and North Africa United Arab Emirates ""         "DAD001"         
 9 DAE0012014  2014 DAE001  Middle East and North Africa United Arab Emirates ""         "DAF001"         
10 DAE0012015  2015 DAE001  Middle East and North Africa United Arab Emirates "DAF001"   "DAD001"         
11 DAE0012015  2015 DAE001  Middle East and North Africa United Arab Emirates "DAD001"   "DAD001"         
12 DAF0012012  2012 DAF001  Asia Pacific                 Afghanistan          "DAD001"   "DAD001"         
13 DAF0012012  2012 DAF001  Asia Pacific                 Afghanistan          "DAD001"   "DAE001"        

Old answer:

An option using twice separate_rows:

library(tidyr)
library(dplyr)
my_df %>%
  separate_rows(supported1, sep = "\\,") %>%
  separate_rows(received_support1, sep = "\\,")
  • Related