I have a dataframe like this
prov_id <- c(599,599,599,599,599,599,599,699,699,699,699,699,699,699,699)
mbr_id <- c(100,101,102,103,103,104,105,200,201,201,202,203,203,204,205)
prov_state <- c("CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA")
amount <- c(3,5,2,28,12,17,10,6,33,31,161,24,22,12,17)
df.sample <- data.frame(prov_id,mbr_id,prov_state,amount,stringsAsFactors=FALSE)
I am trying to compute the averages of amounts by provider and state this way,
library(tidyverse)
# get the member counts for each provider by state
df.sample.memcnt <-
df.sample %>%
select (prov_id, prov_state, mbr_id) %>%
distinct(prov_id, prov_state, mbr_id) %>%
group_by(prov_id, prov_state) %>%
tally(sort=T) %>%
ungroup() %>%
rename(mem_cnt_pvdr = n)
# get the provider counts for each state
df.sample.pvdrcnt <-
df.sample %>%
select (prov_id, prov_state) %>%
distinct(prov_id, prov_state) %>%
group_by(prov_state) %>%
tally(sort=T) %>%
ungroup() %>%
rename(pvdr_cnt_state = n)
# get the mean total amount of providers
df.sample.pvdr <-
df.sample %>%
select (prov_id,prov_state,amount) %>%
group_by(prov_id,prov_state) %>%
summarise(total_amt = sum(as.numeric(amount))) %>%
ungroup() %>%
inner_join(df.sample.memcnt, by = c("prov_id","prov_state")) %>%
mutate(mean_total_amt_pvdr =
round((total_amt / mem_cnt_pvdr),2)) %>%
select(-total_amt)
# get the mean total amount of the state
df.sample.state <-
df.sample.pvdr %>%
group_by(prov_state) %>%
summarise(total_amt_state = sum(as.numeric(mean_total_amt_pvdr)),
mem_cnt_state = sum(mem_cnt_pvdr)) %>%
ungroup() %>%
inner_join(df.sample.pvdrcnt, by = c("prov_state")) %>%
mutate(mean_total_amt_state =
round((total_amt_state / pvdr_cnt_state),2)) %>%
select(-total_amt_state)
# merge provider df with state df
df.final <- df.sample.pvdr %>%
inner_join(df.sample.state)
While I get the output I need, I feel this is very inefficient.
Desired output
prov_id prov_state mem_cnt_pvdr mean_total_amt_pvdr mem_cnt_state pvdr_cnt_state mean_total_amt_state
599 CA 6 12.8 12 2 31.9
699 CA 6 51 12 2 31.9
Is there a way to get the desired output with a few lines of code? I really appreciate it.
CodePudding user response:
A bit shorter with data.table
and uniqueN
:
library(data.table)
setDT(df.sample)
df.sample.state.prov<- df.sample[,{mem_cnt_pvdr=uniqueN(mbr_id );
mean_total_amt_pvdr=round(sum(as.numeric(amount))/mem_cnt_pvdr,2);
.(mem_cnt_pvdr,mean_total_amt_pvdr)},by=.(prov_state,prov_id)]
df.sample.state <- df.sample.state.prov[,.(pvdr_cnt_state=uniqueN(prov_id ),
total_amt_state=sum(as.numeric(mean_total_amt_pvdr)),
mem_cnt_state=sum(mem_cnt_pvdr)),by=.(prov_state)]
df.sample.state[df.sample.state.prov,.(prov_id,
prov_state,
mem_cnt_pvdr,
mean_total_amt_pvdr,
mem_cnt_state,
pvdr_cnt_state,
mean_total_amt_state=total_amt_state/pvdr_cnt_state) ,on=.(prov_state)]
prov_id prov_state mem_cnt_pvdr mean_total_amt_pvdr mem_cnt_state pvdr_cnt_state mean_total_amt_state
1: 599 CA 6 12.83 12 2 31.915
2: 699 CA 6 51.00 12 2 31.915