Home > Net >  How do I unpack tuple format in R?
How do I unpack tuple format in R?

Time:03-12

Here is the dataset.

library(data.table)

x <- structure(list(id = c("A", "B" ),
                    segment_stemming = c("[('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]", 
                                          "[('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]" )), 
               row.names = c(NA, -2L), 
               class = c("data.table", "data.frame" ))

x
# id                                                     segment_stemming
# 1:  A               [('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]
# 2:  B [('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]

I would like to split the tuple into rows. Here is my expected outcome.

id             segment_stemming
A              ('Brownie', 'Noun')
A              ('From', 'Josa')
A              ('Pi', 'Noun')
B              ('Dung-caroon-gye', 'Noun')
B              ('in', 'Josa')
B              ('innovation', 'Noun')

I've searched the tuple format using R but cannot find out any clue to make the outcome.

CodePudding user response:

data.table approach

Here is an option using data.table reticulate

library(reticulate)
library(data.table)
setDT(x)[
  ,
  segment_stemming := gsub("(\\(.*?\\))", '\"\\1\"', segment_stemming)
][
  ,
  lapply(.SD, py_eval),
  id
]

which gives

   id            segment_stemming
1:  A         ('Brownie', 'Noun')
2:  A            ('From', 'Josa')
3:  A              ('Pi', 'Noun')
4:  B ('Dung-caroon-gye', 'Noun')
5:  B              ('in', 'Josa')
6:  B      ('innovation', 'Noun')

Another data.table option using strsplit trimws like below

library(data.table)
setDT(x)[
  ,
  .(segment_stemming = trimws(
    unlist(strsplit(segment_stemming, "(?<=\\)),\\s (?=\\()", perl = TRUE)),
    whitespace = "\\[|\\]"
  )),
  id
]

gives

   id            segment_stemming
1:  A         ('Brownie', 'Noun')
2:  A            ('From', 'Josa')
3:  A              ('Pi', 'Noun')
4:  B ('Dung-caroon-gye', 'Noun')
5:  B              ('in', 'Josa')
6:  B      ('innovation', 'Noun')

base R

Some base R options should work as well

with(
  x,
  setNames(
    rev(
      stack(
        tapply(
          segment_stemming,
          id,
          function(v) {
            trimws(
              unlist(strsplit(v, "(?<=\\)),\\s (?=\\()", perl = TRUE)),
              whitespace = "\\[|\\]"
            )
          }
        )
      )
    ),
    names(x)
  )
)

or

with(
  x,
  setNames(
    rev(
      stack(
        setNames(
          regmatches(segment_stemming, gregexpr("\\(.*?\\)", segment_stemming)),
          id
        )
      )
    ),
    names(x)
  )
)

CodePudding user response:

Here's a way using separate_rows:

library(tidyverse)

x %>% 
  mutate(segment_stemming = gsub("\\[|\\]", "", segment_stemming)) %>% 
  separate_rows(segment_stemming, sep = ",\\s*(?![^()]*\\))")

# A tibble: 6 x 2
  id    segment_stemming           
  <chr> <chr>                      
1 A     ('Brownie', 'Noun')        
2 A     ('From', 'Josa')           
3 A     ('Pi', 'Noun')             
4 B     ('Dung-caroon-gye', 'Noun')
5 B     ('in', 'Josa')             
6 B     ('innovation', 'Noun') 

One way to get a better result, with some manipulation (unnest_wider is not necessary).

x %>% 
  mutate(segment_stemming = gsub("\\[|\\]", "", segment_stemming)) %>% 
  separate_rows(segment_stemming, sep = ",\\s*(?![^()]*\\))") %>% 
  mutate(segment_stemming = segment_stemming %>% 
           str_remove_all("[()',]") %>% 
           str_split(" ")) %>% 
  unnest_wider(segment_stemming)

# A tibble: 6 x 3
  id    ...1            ...2 
  <chr> <chr>           <chr>
1 A     Brownie         Noun 
2 A     From            Josa 
3 A     Pi              Noun 
4 B     Dung-caroon-gye Noun 
5 B     in              Josa 
6 B     innovation      Noun 

CodePudding user response:

Here is another potential option:

library(data.table)

dt <- structure(list(id = c("A", "B" ), segement_stemming = c("[('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]", "[('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]" )), row.names = c(NA, -2L), class = c("data.table", "data.frame" ))

dt2 <- dt[, c(segement_stemming = strsplit(segement_stemming, "(?<=[^']),", perl = TRUE)), by = id]
dt2[, names(dt2) := lapply(.SD, function(x) gsub("\\[|\\]", "", x))]
dt2
#>    id           segement_stemming
#> 1:  A         ('Brownie', 'Noun')
#> 2:  A            ('From', 'Josa')
#> 3:  A              ('Pi', 'Noun')
#> 4:  B ('Dung-caroon-gye', 'Noun')
#> 5:  B              ('in', 'Josa')
#> 6:  B      ('innovation', 'Noun')

Created on 2022-03-11 by the reprex package (v2.0.1)

CodePudding user response:

x[,.(segment_stemming = unlist(str_extract_all(segment_stemming, "\\(.*?\\)"))), by = id]

or you can use tidyr::unnest. This way there is only one call to str_extract_all:

x[, segment_stemming := str_extract_all(segment_stemming, "\\(.*?\\)")]
unnest(x, segment_stemming)

CodePudding user response:

A data.table way would be such as:

library(stringr)

x [, segment_stemming:=gsub("\\[|\\]", "", segment_stemming, perl = T)] #remove brackets
x [, parsed := str_split(segment_stemming, "\\),")]                     # split string
out <- x[, .(unlist(parsed, recursive = F)), by = .(id)]                # unlist elements
out [ , V1  := gsub("\\)?$",")", V1)][]                                 # adjust format

       id                          V1
   <char>                      <char>
1:      A         ('Brownie', 'Noun')
2:      A            ('From', 'Josa')
3:      A              ('Pi', 'Noun')
4:      B ('Dung-caroon-gye', 'Noun')
5:      B              ('in', 'Josa')
6:      B      ('innovation', 'Noun')
  • Related