I have data that looks as follows (example data at the bottom):
# A tibble: 40 × 6
rn strata lower upper direction value
<chr> <list> <chr> <chr> <chr> <chr>
1 A <dbl [6]> 0 25 East 0 (replaced)
2 A <dbl [6]> 25 100 East 3 (replaced)
3 A <dbl [6]> 100 500 East 3
4 A <dbl [6]> 500 1000 East 4
5 A <dbl [6]> 1000 1000000 East 5
6 A <dbl [6]> 0 25 North 0 (replaced)
7 A <dbl [6]> 25 100 North 0 (replaced)
8 A <dbl [6]> 100 500 North 1
9 A <dbl [6]> 500 1000 North 28 (replaced)
10 A <dbl [6]> 1000 1000000 North 2
# … with 30 more rows
I would like to concatenate all value
entries by rn, direction, upper
. This can almost be done with the following code:
dat_in_new <- dat %>%
# One line for each rn-group
group_by(rn, upper, direction) %>%
# Calculate the sum, not taking into account replaced values
summarise(freq = sum(as.numeric(value), na.rm=TRUE), .groups = 'drop_last') %>%
group_modify(~add_row(.,freq = sum(.$value))) %>%
group_by(rn) %>%
summarise(freq = list(freq), .groups = "drop")
# A tibble: 2 × 2
rn freq
<chr> <list>
1 A c(0, 0, 0, 0, 0, 4, 0, 3, 0, 0, 5, 2, 9, 0, 0, 0, 0, 0, 0, 0, 3, 1, 1, 0, 0)
2 B c(0, 0, 1, 0, 0, 13, 0, 2, 1, 0, 10, 3, 5, 0, 0, 1, 0, 1, 0, 0, 4, 0, 0, 1, 0)
This solution now has the correct sum, because the replaced
values should not be added to the sum. However they should be added to the list
. I have been trying to separate the two, but I cannot figure it out.
EDIT:
I thought it would maybe be possible to create another value
column, say value_string
, force value
to numeric and keep value_string
as strings, summarise both of them, get the sum from value
and the values from value_string
. But I can't figure out how to write the syntax.
Desired output:
# A tibble: 2 × 2
rn freq
<chr> <list>
1 A c("0 (replaced)", "0 (replaced)", ... )
2 B c("0 (replaced)", "0 (replaced)", ... )
Related questions:
Make a list out of frequencies, concatenating categories to that list
Using a column, with lists of values, to specify from which columns to create another list of values
DATA
library(dplyr)
libraray(tidyr)
dat <- structure(list(rn = c("A", "A", "A", "A",
"A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "B",
"B", "B", "B", "B",
"B", "B", "B", "B",
"B", "B", "B", "B",
"B", "B", "B", "B",
"B", "B", "B"), strata = list(
c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000,
1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500,
1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100,
500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0,
25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06
), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000,
1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500,
1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100,
500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0,
25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06
), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000,
1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500,
1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100,
500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0,
25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06
), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000,
1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500,
1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100,
500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0,
25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000, 1e 06
), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500, 1000,
1e 06), c(0, 25, 100, 500, 1000, 1e 06), c(0, 25, 100, 500,
1000, 1e 06)), lower = c("0", "25", "100", "500", "1000",
"0", "25", "100", "500", "1000", "0", "25", "100", "500", "1000",
"0", "25", "100", "500", "1000", "0", "25", "100", "500", "1000",
"0", "25", "100", "500", "1000", "0", "25", "100", "500", "1000",
"0", "25", "100", "500", "1000"), upper = c("25", "100", "500",
"1000", "1000000", "25", "100", "500", "1000", "1000000", "25",
"100", "500", "1000", "1000000", "25", "100", "500", "1000",
"1000000", "25", "100", "500", "1000", "1000000", "25", "100",
"500", "1000", "1000000", "25", "100", "500", "1000", "1000000",
"25", "100", "500", "1000", "1000000"), direction = c("East",
"East", "East", "East", "East", "North", "North", "North", "North",
"North", "South", "South", "South", "South", "South", "West",
"West", "West", "West", "West", "East", "East", "East", "East",
"East", "North", "North", "North", "North", "North", "South",
"South", "South", "South", "South", "West", "West", "West", "West",
"West"), value = c("0 (replaced)", "3 (replaced)", "3", "4", "5",
"0 (replaced)", "0 (replaced)", "1", "28 (replaced)", "2", "0 (replaced)",
"2 (replaced)", "1", "3", "9", "0 (replaced)", "1 (replaced)", "9 (replaced)",
"8 (replaced)", "21 (replaced)", "1", "61 (replaced)", "4", "13", "10",
"2 (replaced)", "12 (replaced)", "48 (replaced)", "32 (replaced)", "3",
"1", "1", "76 (replaced)", "2", "5", "0 (replaced)", "4 (replaced)",
"1", "1", "15 (replaced)")), row.names = c(NA, -40L), class = c("tbl_df",
"tbl", "data.frame"))
CodePudding user response:
I am not sure, but maybe you are looking for this:
What we do here is simple paste and collapse all!! the values after unnesting:
library(dplyr)
library(tidyr)
dat %>%
group_by(rn, upper,direction) %>%
summarise(freq = sum(as.numeric(value), na.rm=TRUE), .groups = 'drop_last') %>%
group_modify(~add_row(.,freq = sum(.$value))) %>%
group_by(rn) %>%
summarise(freq = list(freq), .groups = "drop") %>%
unnest() %>%
group_by(rn) %>%
mutate(freq = paste0(freq, " (replaced)", collapse = ", ")) %>%
slice(1)
rn freq
<chr> <chr>
1 A 0 (replaced), 0 (replaced), 0 (replaced), 0 (replaced), 0 (re~
2 B 0 (replaced), 0 (replaced), 1 (replaced), 0 (replaced), 0 (re~
CodePudding user response:
Perhaps this helps
library(dplyr)
library(stringr)
out <- dat %>%
mutate(value_str = replace(value, str_detect(value, "^[0-9] $"), NA_character_),
value = as.numeric(value)) %>%
group_by(rn, lower, upper) %>%
transmute(value = sum(value, na.rm = TRUE), value_str) %>%
group_by(rn, lower) %>%
group_modify(~add_row(., upper = "Sum", value = sum(.$value))) %>%
ungroup %>%
mutate(value = coalesce(value_str, as.character(value))) %>%
distinct(rn, lower, upper, value) %>%
group_by(rn) %>%
summarise(value = list(value))
-output
> out$value
[[1]]
[1] "0 (replaced)" "0" "5" "9 (replaced)" "20" "16" "21 (replaced)" "64" "3 (replaced)"
[10] "2 (replaced)" "1 (replaced)" "7" "28 (replaced)" "8 (replaced)" "28"
[[2]]
[1] "2" "2 (replaced)" "0 (replaced)" "8" "5" "48 (replaced)" "76 (replaced)" "20" "18"
[10] "15 (replaced)" "72" "61 (replaced)" "12 (replaced)" "1" "4 (replaced)" "4" "16" "32 (replaced)"
[19] "64"
CodePudding user response:
I eventually figured it out, although it is far from the cleanest approach:
# Only sum values that are not replaced
dat$upper <- as.character(dat$upper)
dat <- dat %>%
group_by(rn, direction ) %>%
summarise(value = as.character(sum(as.numeric(value), na.rm=TRUE)), .groups = 'drop_last', upper="1000001", strata=strata) %>% # get sum of sizes
bind_rows(dat, .)
# Remove the duplicate rows
dat <- unique( dat )
# Convert upper back to numeric for sorting
dat$upper <- as.numeric(dat$upper)
dat <- dat %>%
arrange(rn, direction, upper)
# Create list
dat <- dat %>%
group_by(rn, strata) %>%
summarise(freq = list(value), .groups = 'drop')