Home > Back-end >  Transform a data frame with inconsistent group widths into one with equally large groups
Transform a data frame with inconsistent group widths into one with equally large groups

Time:04-24

My Goal is to transform something like this:

df1 <- data.frame(
  value1 = c(100, 100, 100, 100, 100, 100, 100),
  #  value2=c(a, b, c, d, e, f, g),
  startgroup = c(1, 101, 351, 356, 401, 451, 451),
  endgroup = c(100, 350, 355, 400, 450, 450, 500),
  groupwidth = c(100, 250, 5, 40, 50, 0 , 50)
)

into something like this:

df2 <- data.frame(
  value1 = c(100, 40, 40, 220, 300),
  #  value2=c(a, b*.4, b*.4, b*.2 c d, d e f),
  startgroup = c(1, 101, 201, 301, 401),
  endgroup = c(100, 200, 300, 400, 500),
  groupwidth = c(100, 100, 100, 100, 100)
)

I already managed to do it with a for-loop, but it somehow takes about 5-10 minutes per variable. No idea why tough. and I am certain, that there is a simple way to achieve it.

CodePudding user response:

library(tidyverse)
df1 %>%
  filter(groupwidth>0) %>%
  rowwise() %>%
  mutate(gr = list(c(rep(100, groupwidth %/0), groupwidth %0)),
         ln = length(gr)) %>%
  unnest(gr) %>%
  group_by(st=cumsum(gr-1) %/0) %>%
  summarise(val = sum(value1/groupwidth * gr), 
            startgroup = st[1] * 100   1, 
            endgroup = startgroup   99, 
            groupwidth = 100)

# A tibble: 5 x 5
     st   val startgroup endgroup groupwidth
  <dbl> <dbl>      <dbl>    <dbl>      <dbl>
1     0   100          1      100        100
2     1    40        101      200        100
3     2    40        201      300        100
4     3   220        301      400        100
5     4   200        401      500        100

CodePudding user response:

In case anyone wonders, I found a working solution, that works (on the real data) in less than 5 seconds. The solution ended up being a for loop as well.

df1 <- data.frame(
  value1 = c(100, 100, 100, 100, 100, 100, 100),
  #  value2=c(a, b, c, d, e, f, g),
  startgroup = c(1, 101, 351, 356, 401, 451, 451),
  endgroup = c(100, 350, 355, 400, 450, 450, 500),
  groupwidth = c(100, 250, 5, 40, 50, 0 , 50)) %>% 
  mutate(startgroup = startgroup - 1)


rm <- max(df1$endgroup)%/0 1

for (i in 1:rm){
  df1 <- df1 %>% 
    mutate(
      a = 0,
      a = ifelse(startgroup <= 100*i-100 & endgroup > 100*i, 
        a   value1/(endgroup-startgroup)*100, 
        a),
      a = ifelse(startgroup <= 100*i-100 & endgroup <= 100*i & endgroup > 100*i-100,  
        a   value1/(endgroup-startgroup)*(endgroup - (i-1)*100),
        a),
      a = ifelse(startgroup > 100*i-100 & endgroup <= 100*i,  
        a   value1,
        a),
      a = ifelse(startgroup > 100*i-100 & startgroup <= 100*i & endgroup > 100*i,  
        a   value1/(endgroup-startgroup)*(i*100 - startgroup),
        a),
      !!paste0(i*100) := a
    )
}
df1 <- df1 %>% 
    pivot_longer(
    cols = contains("00"),
    names_to = "upper_bound",
    values_to = "value",
    values_drop_na = TRUE) %>% 
  group_by(upper_bound) %>% 
  mutate(upper_bound = as.integer(upper_bound)) %>% 
  summarize(
    value = sum(value)
  )
  • Related