I have 2 dataframes, df1 contains a groupID and continuous variables like so:
GroupID Var1 Var2 Var3 Var4
1 20.33115 19.59319 0.6384765 0.6772862
1 31.05899 23.14446 0.5796645 0.7273182
2 24.28984 20.99047 0.6425050 0.6865804
2 22.47856 21.36709 0.6690020 0.6368560
3 21.65817 20.99444 0.6829786 0.6461840
3 23.45899 21.57718 0.6655482 0.6473043
And df2 contains cutoff values (ct) for each variable:
Var1ct Var2ct Var3ct Var4ct
22.7811 20.3349 0.7793 0.4294
What I want to do is, for each variable in df1, find the number of rows where the value is greater than the cutoff value in its associated columnn in df2 and return that number for each groupID, so the output would look like this:
GroupID N-Var1 N-Var2 N-Var3 N-Var4
1 62 78 33 99
2 69 25 77 12
3 55 45 27 62
df1 is ~ 2million rows unevenly distributed by GroupID and 30 variable columns I need the count for, I am just looking for a more effecient way than typing out the same function for all 30 variables.
CodePudding user response:
Here's a way in dplyr
:
library(dplyr)
df1 %>%
group_by(GroupID) %>%
summarise(across(everything(), ~ sum(.x > df2[grepl(cur_column(), colnames(df2))][, 1])))
GroupID Var1 Var2 Var3 Var4
<int> <int> <int> <int> <int>
1 1 1 1 0 2
2 2 1 2 0 2
3 3 1 2 0 2
data
df1 <- read.table(header = T, text = "GroupID Var1 Var2 Var3 Var4
1 20.33115 19.59319 0.6384765 0.6772862
1 31.05899 23.14446 0.5796645 0.7273182
2 24.28984 20.99047 0.6425050 0.6865804
2 22.47856 21.36709 0.6690020 0.6368560
3 21.65817 20.99444 0.6829786 0.6461840
3 23.45899 21.57718 0.6655482 0.6473043 ")
df2 <- read.table(header = T, text = "Var1ct Var2ct Var3ct Var4ct
22.7811 20.3349 0.7793 0.4294")
CodePudding user response:
a data.table approach that should scale well..
library(data.table)
# if df1 and dsf2 are not data.table, use
# setDT(df)1; setDT(df2)
# we need similara columnnames in df1 and df2 to easily join
setnames(df2, names(df1)[2:5])
# melt df1 and to long format
df1.long <- melt(df1, id.vars = "GroupID")
df2.long <- melt(df2, measure.vars = names(df2))
# join ct-values
df1.long[df2.long, ct := i.value, on = .(variable)]
# summarise
ans <- df1.long[, sum(value > ct), by = .(GroupID, variable)]
# cast to wide
dcast(ans, GroupID ~ variable, value.var = "V1")
# GroupID Var1 Var2 Var3 Var4
# 1: 1 1 1 0 2
# 2: 2 1 2 0 2
# 3: 3 1 2 0 2
sample data
df1 <- fread("GroupID Var1 Var2 Var3 Var4
1 20.33115 19.59319 0.6384765 0.6772862
1 31.05899 23.14446 0.5796645 0.7273182
2 24.28984 20.99047 0.6425050 0.6865804
2 22.47856 21.36709 0.6690020 0.6368560
3 21.65817 20.99444 0.6829786 0.6461840
3 23.45899 21.57718 0.6655482 0.6473043 ")
df2 <- fread("Var1ct Var2ct Var3ct Var4ct
22.7811 20.3349 0.7793 0.4294")