I have data that looks as follows (example data at the bottom):

# A tibble: 40 × 6
   rn    strata    lower upper   direction value        
   <chr> <list>    <chr> <chr>   <chr>     <chr>        
 1 A     <dbl [6]> 0     25      East      0 (replaced) 
 2 A     <dbl [6]> 25    100     East      3 (replaced) 
 3 A     <dbl [6]> 100   500     East      3            
 4 A     <dbl [6]> 500   1000    East      4            
 5 A     <dbl [6]> 1000  1000000 East      5            
 6 A     <dbl [6]> 0     25      North     0 (replaced) 
 7 A     <dbl [6]> 25    100     North     0 (replaced) 
 8 A     <dbl [6]> 100   500     North     1            
 9 A     <dbl [6]> 500   1000    North     28 (replaced)
10 A     <dbl [6]> 1000  1000000 North     2            
# … with 30 more rows

I would like to concatenate all value entries by rn, direction, upper. This can almost be done with the following code:

dat_in_new <- dat %>%
    # One line for each rn-group
    group_by(rn, upper, direction) %>% 
    # Calculate the sum, not taking into account replaced values
    summarise(freq = sum(as.numeric(value), na.rm=TRUE), .groups = 'drop_last') %>% 
    group_modify(~add_row(.,freq = sum(.$value))) %>% 
    group_by(rn) %>%
    summarise(freq = list(freq), .groups = "drop")

# A tibble: 2 × 2
  rn    freq      
  <chr> <list>    
1 A     c(0, 0, 0, 0, 0, 4, 0, 3, 0, 0, 5, 2, 9, 0, 0, 0, 0, 0, 0, 0, 3, 1, 1, 0, 0)
2 B     c(0, 0, 1, 0, 0, 13, 0, 2, 1, 0, 10, 3, 5, 0, 0, 1, 0, 1, 0, 0, 4, 0, 0, 1, 0)

This solution now has the correct sum, because the replaced values should not be added to the sum. However they should be added to the list. I have been trying to separate the two, but I cannot figure it out.

EDIT:

I thought it would maybe be possible to create another value column, say value_string, force value to numeric and keep value_string as strings, summarise both of them, get the sum from value and the values from value_string. But I can't figure out how to write the syntax.

Desired output:

# A tibble: 2 × 2
  rn    freq      
  <chr> <list>    
1 A     c("0 (replaced)", "0 (replaced)", ... )
2 B     c("0 (replaced)", "0 (replaced)", ... )

DATA

library(dplyr)
libraray(tidyr)
dat <- structure(list(rn = c("A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "B", 
"B", "B", "B", "B", 
"B", "B", "B", "B", 
"B", "B", "B", "B", 
"B", "B", "B", "B", 
"B", "B", "B"), strata = list(
    c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 
    1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 
    1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 
    500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 
    25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06
    ), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 
    1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 
    1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 
    500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 
    25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06
    ), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 
    1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 
    1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 
    500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 
    25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06
    ), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 
    1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 
    1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 
    500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 
    25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06
    ), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 
    1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 
    1000, 1e 06)), lower = c("0", "25", "100", "500", "1000", 
"0", "25", "100", "500", "1000", "0", "25", "100", "500", "1000", 
"0", "25", "100", "500", "1000", "0", "25", "100", "500", "1000", 
"0", "25", "100", "500", "1000", "0", "25", "100", "500", "1000", 
"0", "25", "100", "500", "1000"), upper = c("25", "100", "500", 
"1000", "1000000", "25", "100", "500", "1000", "1000000", "25", 
"100", "500", "1000", "1000000", "25", "100", "500", "1000", 
"1000000", "25", "100", "500", "1000", "1000000", "25", "100", 
"500", "1000", "1000000", "25", "100", "500", "1000", "1000000", 
"25", "100", "500", "1000", "1000000"), direction = c("East", 
"East", "East", "East", "East", "North", "North", "North", "North", 
"North", "South", "South", "South", "South", "South", "West", 
"West", "West", "West", "West", "East", "East", "East", "East", 
"East", "North", "North", "North", "North", "North", "South", 
"South", "South", "South", "South", "West", "West", "West", "West", 
"West"), value = c("0 (replaced)", "3 (replaced)", "3", "4", "5", 
"0 (replaced)", "0 (replaced)", "1", "28 (replaced)", "2", "0 (replaced)", 
"2 (replaced)", "1", "3", "9", "0 (replaced)", "1 (replaced)", "9 (replaced)", 
"8 (replaced)", "21 (replaced)", "1", "61 (replaced)", "4", "13", "10", 
"2 (replaced)", "12 (replaced)", "48 (replaced)", "32 (replaced)", "3", 
"1", "1", "76 (replaced)", "2", "5", "0 (replaced)", "4 (replaced)", 
"1", "1", "15 (replaced)")), row.names = c(NA, -40L), class = c("tbl_df", 
"tbl", "data.frame"))

CodePudding user response：

I am not sure, but maybe you are looking for this:

What we do here is simple paste and collapse all!! the values after unnesting:

library(dplyr)
library(tidyr)

dat %>% 
  group_by(rn, upper,direction) %>% 
  summarise(freq = sum(as.numeric(value), na.rm=TRUE), .groups = 'drop_last') %>% 
  group_modify(~add_row(.,freq = sum(.$value))) %>% 
  group_by(rn) %>%
  summarise(freq = list(freq), .groups = "drop") %>% 
  unnest() %>% 
  group_by(rn) %>% 
  mutate(freq = paste0(freq, " (replaced)", collapse = ", ")) %>% 
  slice(1)

rn    freq                                                          
  <chr> <chr>                                                         
1 A     0 (replaced), 0 (replaced), 0 (replaced), 0 (replaced), 0 (re~
2 B     0 (replaced), 0 (replaced), 1 (replaced), 0 (replaced), 0 (re~

CodePudding user response：

Perhaps this helps

library(dplyr)
library(stringr)
out <- dat %>% 
  mutate(value_str = replace(value, str_detect(value, "^[0-9] $"), NA_character_), 
    value = as.numeric(value)) %>%
   group_by(rn, lower, upper) %>% 
   transmute(value = sum(value, na.rm = TRUE), value_str) %>% 
   group_by(rn, lower)  %>%
   group_modify(~add_row(., upper = "Sum", value = sum(.$value))) %>% 
  ungroup %>% 
  mutate(value = coalesce(value_str, as.character(value))) %>% 
  distinct(rn, lower, upper, value) %>% 
  group_by(rn) %>% 
  summarise(value = list(value))

-output

> out$value
[[1]]
 [1] "0 (replaced)"  "0"             "5"             "9 (replaced)"  "20"            "16"            "21 (replaced)" "64"            "3 (replaced)" 
[10] "2 (replaced)"  "1 (replaced)"  "7"             "28 (replaced)" "8 (replaced)"  "28"           

[[2]]
 [1] "2"             "2 (replaced)"  "0 (replaced)"  "8"             "5"             "48 (replaced)" "76 (replaced)" "20"            "18"           
[10] "15 (replaced)" "72"            "61 (replaced)" "12 (replaced)" "1"             "4 (replaced)"  "4"             "16"            "32 (replaced)"
[19] "64"

CodePudding user response：

I eventually figured it out, although it is far from the cleanest approach:

# Only sum values that are not replaced
dat$upper <- as.character(dat$upper)
dat <- dat %>%
    group_by(rn, direction  ) %>% 
    summarise(value = as.character(sum(as.numeric(value), na.rm=TRUE)), .groups = 'drop_last', upper="1000001", strata=strata) %>%                  # get sum of sizes
    bind_rows(dat, .)   
# Remove the duplicate rows     
dat <- unique( dat )

# Convert upper back to numeric for sorting                     
dat$upper <- as.numeric(dat$upper)
dat <- dat %>%
    arrange(rn, direction, upper)

# Create list
dat <- dat %>%
    group_by(rn, strata) %>% 
    summarise(freq = list(value), .groups = 'drop')

EDIT:

Desired output:

Related questions:

DATA