I have a dataset as follows -
library(data.table)
dt1 = data.table(A = c(rnorm(1:5, mean = 5, sd = 1), rnorm(1:5, mean = 7, sd = 1), rnorm(1:5, mean = -2, sd = 1)),
group= c(rep(0,5), rep(1,5), rep(2,5)))
> dt1
A group
1: 3.276487 0
2: 4.958663 0
3: 4.623734 0
4: 6.163073 0
5: 3.361670 0
6: 8.513315 1
7: 6.793443 1
8: 6.615550 1
9: 5.675353 1
10: 7.484761 1
11: -2.089462 2
12: -1.234090 2
13: -2.398878 2
14: -1.794349 2
15: -3.584671 2
I want to add a new column called label
so that labels are in ascending order of the group mean of the first column (column A in dt1
). Here is the expected output -
> dt1
A group label
1: 3.276487 0 1
2: 4.958663 0 1
3: 4.623734 0 1
4: 6.163073 0 1
5: 3.361670 0 1
6: 8.513315 1 2
7: 6.793443 1 2
8: 6.615550 1 2
9: 5.675353 1 2
10: 7.484761 1 2
11: -2.089462 2 0
12: -1.234090 2 0
13: -2.398878 2 0
14: -1.794349 2 0
15: -3.584671 2 0
A one-liner solution using data.table
is preferred.
Here is the performance of all the data.table
solutions. I am accepting Christian's solution due to its ease of understanding, use of data.table
library (as requested) and speed.
microbenchmark::microbenchmark(
rg255 = dt1 <- dt1[dt1[, mean(A), by=group], on="group"][order(V1), V1 := .GRP, by=group],
maydin = {x <- aggregate(dt1[,1],by=dt1[,2],mean)
merge(dt1,data.frame(group=x[,-2],label=3-order(-x[,2])),by="group")},
r2evans = dt1[, mu := mean(A), by = .(group)][, label := match(mu, sort(unique(mu))) - 1][, mu := NULL][],
ThomasIsCoding = dt1[
dt1[, .(label = mean(A)), group][
, label := rank(label) - 1
],
on = .(group)
],
Christian = dt1[, label := match(group, dt1[, mean(A), group][order(V1)]$group)-1],
Sotos = rep(rank(unique(with(dt1, ave(A, group)))), table(dt1$group)) - 1,
times = 100)
Unit: microseconds
expr min lq mean median uq max neval cld
rg255 3621.721 4855.061 5730.3802 5992.227 6522.7175 10119.238 100 c
maydin 4716.400 6775.483 7398.4339 7456.865 7817.8770 36566.490 100 d
r2evans 1921.305 2053.526 2510.4819 2160.812 2508.9465 29398.192 100 b
ThomasIsCoding 3252.965 5054.413 5399.1829 5615.391 6062.0480 8690.087 100 c
Christian 1825.354 1964.455 2158.2592 2045.222 2407.6425 3971.686 100 b
Sotos 607.254 705.982 755.2218 749.612 779.3045 1200.381 100 a
CodePudding user response:
Here are two possible ways to solve your problem using data.table.
.GRP
(group counter) and .EACH
(group by each i
) are explained on data.table manual HERE
### method 1
dt1[dt1[, mean(A), group][order(V1)], label := .GRP-1, by=.EACHI, on="group"]
# A group label
# 1: 4.750857 0 1
# 2: 5.093418 0 1
# 3: 5.872350 0 1
# 4: 4.748723 0 1
# 5: 4.620098 0 1
# 6: 6.298940 1 2
# 7: 6.676790 1 2
# 8: 7.697083 1 2
# 9: 7.312748 1 2
# 10: 7.358848 1 2
# 11: -2.444162 2 0
# 12: -2.154480 2 0
# 13: -3.213787 2 0
# 14: -3.023242 2 0
# 15: -1.229246 2 0
### Method 2
dt1[, label := match(group, dt1[, mean(A), group][order(V1)]$group)-1]
# A group label
# 1: 4.750857 0 1
# 2: 5.093418 0 1
# 3: 5.872350 0 1
# 4: 4.748723 0 1
# 5: 4.620098 0 1
# 6: 6.298940 1 2
# 7: 6.676790 1 2
# 8: 7.697083 1 2
# 9: 7.312748 1 2
# 10: 7.358848 1 2
# 11: -2.444162 2 0
# 12: -2.154480 2 0
# 13: -3.213787 2 0
# 14: -3.023242 2 0
# 15: -1.229246 2 0
CodePudding user response:
Here is a base R one-liner,
rep(rank(unique(with(dt1, ave(A, group)))), table(dt1$group)) - 1
#[1] 1 1 1 1 1 2 2 2 2 2 0 0 0 0 0
CodePudding user response:
Another data.table
option
dt1[
dt1[, .(label = mean(A)), group][
, label := rank(label) - 1
],
on = .(group)
]
gives
A group label
1: 6.052347 0 1
2: 5.922558 0 1
3: 4.224303 0 1
4: 4.067353 0 1
5: 3.801318 0 1
6: 8.415603 1 2
7: 7.179845 1 2
8: 7.056742 1 2
9: 6.912266 1 2
10: 7.837252 1 2
11: -2.051671 2 0
12: -1.323038 2 0
13: -1.342514 2 0
14: -1.450518 2 0
15: -1.785296 2 0
CodePudding user response:
another base R
with(list(mu = ave(dt1$A, dt1$group)), match(mu, sort(unique(mu)))) - 1
# [1] 1 1 1 1 1 2 2 2 2 2 0 0 0 0 0
another data.table
(similar to others, no joins required)
dt1[, mu := mean(A), by = .(group)][, label := match(mu, sort(unique(mu))) - 1][, mu := NULL][]
# A group label
# <num> <num> <num>
# 1: 6.3709584 0 1
# 2: 4.4353018 0 1
# 3: 5.3631284 0 1
# 4: 5.6328626 0 1
# 5: 5.4042683 0 1
# 6: 6.8938755 1 2
# 7: 8.5115220 1 2
# 8: 6.9053410 1 2
# 9: 9.0184237 1 2
# 10: 6.9372859 1 2
# 11: -0.6951303 2 0
# 12: 0.2866454 2 0
# 13: -3.3888607 2 0
# 14: -2.2787888 2 0
# 15: -2.1333213 2 0
Data
dt1 <- setDT(structure(list(A = c(6.37095844714667, 4.43530182860391, 5.36312841133734, 5.63286260496104, 5.404268323141, 6.89387548390852, 8.51152199743894, 6.9053409615869, 9.01842371387704, 6.93728590094758, -0.695130345776515, 0.286645392701107, -3.38886070111234, -2.27878876681737, -2.13332133639366), group = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), label = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0)), row.names = c(NA, -15L), class = c("data.table", "data.frame")))
CodePudding user response:
If dplyr
available,
library(dplyr)
df1 <- dt1 %>%
group_by(label) %>%
summarize(new_label = sum(A)) %>%
ungroup %>%
mutate(new_label = rank(x)-1)
dt1 %>%
left_join(df1, by = "label")
A label x
1: 5.7925501 0 1
2: 5.4773940 0 1
3: 5.7178468 0 1
4: 3.5393696 0 1
5: 4.7657958 0 1
6: 6.2677985 1 2
7: 5.4450776 1 2
8: 6.1410713 1 2
9: 5.4983753 1 2
10: 6.7543268 1 2
11: -1.5883667 2 0
12: -1.0856677 2 0
13: -1.4062993 2 0
14: -1.9452689 2 0
15: -0.2509141 2 0
CodePudding user response:
Another data.table approach - it's getting the group means, merging them on, then sorting by those values and assigning the value of the .GRP
special character
dt1 <- dt1[dt1[, mean(A), by=group], on="group"][order(V1), V1 := .GRP, by=group]
CodePudding user response:
Another apporach with Base R
x <- aggregate(dt1[,1],by=dt1[,2],mean)
merge(dt1,data.frame(group=x[,-2],label=3-order(-x[,2])),by="group")
gives,
# group A label
# 1: 0 4.602405 1
# 2: 0 5.619589 1
# 3: 0 4.435626 1
# 4: 0 4.164321 1
# 5: 0 5.334557 1
# 6: 1 8.288011 2
# 7: 1 6.741436 2
# 8: 1 6.693326 2
# 9: 1 5.567287 2
# 10: 1 6.056554 2
# 11: 2 -4.322494 0
# 12: 2 -4.085312 0
# 13: 2 -1.747721 0
# 14: 2 -2.473705 0
# 15: 2 -1.176904 0