How to convert a list of characters that is stored in a single value (per record) into a format that-CodePudding

So I have the following data. In this dataframe I have a username, date, and a list of packages. My goal is to transform that column packages into a format I can analyze.

> print(data_example)
# A tibble: 4 x 3
  username date     packages                                                                   
  <chr>    <chr>    <chr>                                                                      
1 John     1/5/2015 "compiler 4.1.2, magrittr 2.0.1, ellipsis 0.3.2, tools 4.1.2, pillar 1.6.4"
2 Karen    1/5/2015 "compiler 4.1.2, tools 4.1.2\""                                            
3 Mike     1/5/2015 "evaluate 0.14, highr 0.9, httr 1.4.2, pillar 1.6.4, rlang 0.4.12"         
4 Zoe      1/6/2015 "httr 1.4.2, viridisLite 0.4.0, jsonlite 1.7.2, splines 4.1.2"

I tried doing something like this but this doesn't really give me much value.

packages <- as.list(parsed_log$packages)

My goal would be to try doing a "pivot longer" and get an outcome like this? I'm not sure if this is the best way to analyze this kind of data.

username   date      packages
John       1/5/2015  compiler 4.1.2
John       1/5/2015  magrittr 2.0.1
John       1/5/2015  ellipsis 0.3.2
John       1/5/2015  tools 4.1.2
ect...

CodePudding user response：

A solution, based on tidyr::separate:

library(tidyr)

df <- data.frame(
  stringsAsFactors = FALSE,
  username = c("John", "Karen", "Mike", "Zoe"),
  date = c("1/5/2015", "1/5/2015", "1/5/2015", "1/6/2015"),
  packages = c("compiler 4.1.2, magrittr 2.0.1, ellipsis 0.3.2, tools 4.1.2, pillar 1.6.4","compiler 4.1.2, tools 4.1.2\\\"",
               "evaluate 0.14, highr 0.9, httr 1.4.2, pillar 1.6.4, rlang 0.4.12",
               "httr 1.4.2, viridisLite 0.4.0, jsonlite 1.7.2, splines 4.1.2")
)

separate_rows(df, packages, sep = ", ")

#> # A tibble: 16 × 3
#>    username date     packages           
#>    <chr>    <chr>    <chr>              
#>  1 John     1/5/2015 "compiler 4.1.2"   
#>  2 John     1/5/2015 "magrittr 2.0.1"   
#>  3 John     1/5/2015 "ellipsis 0.3.2"   
#>  4 John     1/5/2015 "tools 4.1.2"      
#>  5 John     1/5/2015 "pillar 1.6.4"     
#>  6 Karen    1/5/2015 "compiler 4.1.2"   
#>  7 Karen    1/5/2015 "tools 4.1.2\\\""  
#>  8 Mike     1/5/2015 "evaluate 0.14"    
#>  9 Mike     1/5/2015 "highr 0.9"        
#> 10 Mike     1/5/2015 "httr 1.4.2"       
#> 11 Mike     1/5/2015 "pillar 1.6.4"     
#> 12 Mike     1/5/2015 "rlang 0.4.12"     
#> 13 Zoe      1/6/2015 "httr 1.4.2"       
#> 14 Zoe      1/6/2015 "viridisLite 0.4.0"
#> 15 Zoe      1/6/2015 "jsonlite 1.7.2"   
#> 16 Zoe      1/6/2015 "splines 4.1.2"

CodePudding user response：

One option is to use stringi to split:

s <- strsplit(as.character(df$packages), ',')
data.frame(
  packages = unlist(s),
  date = rep(df$date, length(s)),
  username = rep(df$username, length(s))
)

             packages     date username
1      compiler 4.1.2 1/5/2015     John
2      magrittr 2.0.1 1/5/2015    Karen
3      ellipsis 0.3.2 1/5/2015     Mike
4         tools 4.1.2 1/6/2015      Zoe
5        pillar 1.6.4 1/5/2015     John
6      compiler 4.1.2 1/5/2015    Karen
7      tools 4.1.2\\" 1/5/2015     Mike
8       evaluate 0.14 1/6/2015      Zoe
9           highr 0.9 1/5/2015     John
10         httr 1.4.2 1/5/2015    Karen
11       pillar 1.6.4 1/5/2015     Mike
12       rlang 0.4.12 1/6/2015      Zoe
13         httr 1.4.2 1/5/2015     John
14  viridisLite 0.4.0 1/5/2015    Karen
15     jsonlite 1.7.2 1/5/2015     Mike
16      splines 4.1.2 1/6/2015      Zoe

Another option is to use cSplit from splitstackshape:

library(splitstackshape)
cSplit(df, "packages", ",", direction = "long")

data.table option:

library(data.table)
dt <- as.data.table(df)
dt[, strsplit(as.character(packages), ","), by = c("username", "date")]

Data (Thanks @PaulS for the data)

df <- structure(list(username = c("John", "Karen", "Mike", "Zoe"), 
                     date = c("1/5/2015", "1/5/2015", "1/5/2015", "1/6/2015"), 
                     packages = c("compiler 4.1.2, magrittr 2.0.1, ellipsis 0.3.2, tools 4.1.2, pillar 1.6.4", 
                                  "compiler 4.1.2, tools 4.1.2\\\"", "evaluate 0.14, highr 0.9, httr 1.4.2, pillar 1.6.4, rlang 0.4.12", 
                                  "httr 1.4.2, viridisLite 0.4.0, jsonlite 1.7.2, splines 4.1.2"
                     )), class = "data.frame", row.names = c(NA, -4L))