Get the averages of amount by providers and state with few lines of code in R-CodePudding

I have a dataframe like this

prov_id <- c(599,599,599,599,599,599,599,699,699,699,699,699,699,699,699)
mbr_id <- c(100,101,102,103,103,104,105,200,201,201,202,203,203,204,205)
prov_state <- c("CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA")
amount <- c(3,5,2,28,12,17,10,6,33,31,161,24,22,12,17)
df.sample <- data.frame(prov_id,mbr_id,prov_state,amount,stringsAsFactors=FALSE)

I am trying to compute the averages of amounts by provider and state this way,

library(tidyverse) 
   
# get the member counts for each provider by state
df.sample.memcnt <- 
  df.sample %>%
  select (prov_id, prov_state, mbr_id) %>%
  distinct(prov_id, prov_state, mbr_id) %>%
  group_by(prov_id, prov_state) %>%
  tally(sort=T) %>%
  ungroup() %>% 
  rename(mem_cnt_pvdr = n)

# get the provider counts for each state
df.sample.pvdrcnt <- 
  df.sample %>%
  select (prov_id, prov_state) %>%
  distinct(prov_id, prov_state) %>%
  group_by(prov_state) %>%
  tally(sort=T) %>%
  ungroup() %>% 
  rename(pvdr_cnt_state = n)

# get the mean total amount of providers
df.sample.pvdr <- 
  df.sample %>%
  select (prov_id,prov_state,amount) %>%
  group_by(prov_id,prov_state) %>%
  summarise(total_amt = sum(as.numeric(amount))) %>%
  ungroup() %>%
  inner_join(df.sample.memcnt, by = c("prov_id","prov_state")) %>%
  mutate(mean_total_amt_pvdr = 
           round((total_amt / mem_cnt_pvdr),2)) %>%
  select(-total_amt)

# get the mean total amount of the state
df.sample.state <- 
  df.sample.pvdr %>%
  group_by(prov_state) %>%
  summarise(total_amt_state = sum(as.numeric(mean_total_amt_pvdr)),
            mem_cnt_state = sum(mem_cnt_pvdr)) %>%
  ungroup() %>%
  inner_join(df.sample.pvdrcnt, by = c("prov_state")) %>%
  mutate(mean_total_amt_state = 
           round((total_amt_state / pvdr_cnt_state),2)) %>%
  select(-total_amt_state)

# merge provider df with state df
df.final <- df.sample.pvdr %>%
  inner_join(df.sample.state)

While I get the output I need, I feel this is very inefficient.

Desired output

prov_id prov_state mem_cnt_pvdr mean_total_amt_pvdr mem_cnt_state pvdr_cnt_state mean_total_amt_state
    599   CA                  6                12.8            12              2                 31.9
    699   CA                  6                51              12              2                 31.9

Is there a way to get the desired output with a few lines of code? I really appreciate it.

CodePudding user response：

A bit shorter with data.table and uniqueN:

library(data.table)

setDT(df.sample)

df.sample.state.prov<- df.sample[,{mem_cnt_pvdr=uniqueN(mbr_id );
                                   mean_total_amt_pvdr=round(sum(as.numeric(amount))/mem_cnt_pvdr,2); 
                                   .(mem_cnt_pvdr,mean_total_amt_pvdr)},by=.(prov_state,prov_id)]

df.sample.state <- df.sample.state.prov[,.(pvdr_cnt_state=uniqueN(prov_id ),
                                           total_amt_state=sum(as.numeric(mean_total_amt_pvdr)),
                                           mem_cnt_state=sum(mem_cnt_pvdr)),by=.(prov_state)]

df.sample.state[df.sample.state.prov,.(prov_id,
                                       prov_state,
                                       mem_cnt_pvdr,
                                       mean_total_amt_pvdr,
                                       mem_cnt_state,
                                       pvdr_cnt_state,
                                       mean_total_amt_state=total_amt_state/pvdr_cnt_state)  ,on=.(prov_state)]

   prov_id prov_state mem_cnt_pvdr mean_total_amt_pvdr mem_cnt_state pvdr_cnt_state mean_total_amt_state
1:     599         CA            6               12.83            12              2               31.915
2:     699         CA            6               51.00            12              2               31.915