Home > Blockchain >  Creating labels by group mean
Creating labels by group mean

Time:12-24

I have a dataset as follows -

library(data.table)
dt1 = data.table(A = c(rnorm(1:5, mean = 5, sd = 1), rnorm(1:5, mean = 7, sd = 1), rnorm(1:5, mean = -2, sd = 1)), 
                   group= c(rep(0,5), rep(1,5), rep(2,5)))
> dt1
            A group
 1:  3.276487     0
 2:  4.958663     0
 3:  4.623734     0
 4:  6.163073     0
 5:  3.361670     0
 6:  8.513315     1
 7:  6.793443     1
 8:  6.615550     1
 9:  5.675353     1
10:  7.484761     1
11: -2.089462     2
12: -1.234090     2
13: -2.398878     2
14: -1.794349     2
15: -3.584671     2

I want to add a new column called label so that labels are in ascending order of the group mean of the first column (column A in dt1). Here is the expected output -

> dt1
            A group label
 1:  3.276487     0  1
 2:  4.958663     0  1
 3:  4.623734     0  1
 4:  6.163073     0  1
 5:  3.361670     0  1
 6:  8.513315     1  2
 7:  6.793443     1  2
 8:  6.615550     1  2
 9:  5.675353     1  2
10:  7.484761     1  2
11: -2.089462     2  0
12: -1.234090     2  0
13: -2.398878     2  0
14: -1.794349     2  0
15: -3.584671     2  0

A one-liner solution using data.table is preferred.

Here is the performance of all the data.table solutions. I am accepting Christian's solution due to its ease of understanding, use of data.table library (as requested) and speed.

  microbenchmark::microbenchmark(
    rg255 = dt1 <- dt1[dt1[, mean(A), by=group], on="group"][order(V1), V1 := .GRP, by=group],
    
    maydin = {x <- aggregate(dt1[,1],by=dt1[,2],mean)
    merge(dt1,data.frame(group=x[,-2],label=3-order(-x[,2])),by="group")},
    
    r2evans = dt1[, mu := mean(A), by = .(group)][, label := match(mu, sort(unique(mu))) - 1][, mu := NULL][],
    ThomasIsCoding = dt1[
      dt1[, .(label = mean(A)), group][
        , label := rank(label) - 1
      ],
      on = .(group)
    ],
    Christian = dt1[, label := match(group, dt1[, mean(A), group][order(V1)]$group)-1],
    Sotos = rep(rank(unique(with(dt1, ave(A, group)))), table(dt1$group)) - 1,
    
    times = 100)

Unit: microseconds
           expr      min       lq      mean   median        uq       max neval  cld
          rg255 3621.721 4855.061 5730.3802 5992.227 6522.7175 10119.238   100   c 
         maydin 4716.400 6775.483 7398.4339 7456.865 7817.8770 36566.490   100    d
        r2evans 1921.305 2053.526 2510.4819 2160.812 2508.9465 29398.192   100  b  
 ThomasIsCoding 3252.965 5054.413 5399.1829 5615.391 6062.0480  8690.087   100   c 
      Christian 1825.354 1964.455 2158.2592 2045.222 2407.6425  3971.686   100  b  
          Sotos  607.254  705.982  755.2218  749.612  779.3045  1200.381   100 a 

CodePudding user response:

Here are two possible ways to solve your problem using data.table. .GRP (group counter) and .EACH (group by each i) are explained on data.table manual HERE

### method 1
dt1[dt1[, mean(A), group][order(V1)], label := .GRP-1, by=.EACHI, on="group"]

#             A group label
#  1:  4.750857     0     1
#  2:  5.093418     0     1
#  3:  5.872350     0     1
#  4:  4.748723     0     1
#  5:  4.620098     0     1
#  6:  6.298940     1     2
#  7:  6.676790     1     2
#  8:  7.697083     1     2
#  9:  7.312748     1     2
# 10:  7.358848     1     2
# 11: -2.444162     2     0
# 12: -2.154480     2     0
# 13: -3.213787     2     0
# 14: -3.023242     2     0
# 15: -1.229246     2     0



### Method 2
dt1[, label := match(group, dt1[, mean(A), group][order(V1)]$group)-1]

#             A group label
#  1:  4.750857     0     1
#  2:  5.093418     0     1
#  3:  5.872350     0     1
#  4:  4.748723     0     1
#  5:  4.620098     0     1
#  6:  6.298940     1     2
#  7:  6.676790     1     2
#  8:  7.697083     1     2
#  9:  7.312748     1     2
# 10:  7.358848     1     2
# 11: -2.444162     2     0
# 12: -2.154480     2     0
# 13: -3.213787     2     0
# 14: -3.023242     2     0
# 15: -1.229246     2     0

CodePudding user response:

Here is a base R one-liner,

rep(rank(unique(with(dt1, ave(A, group)))), table(dt1$group)) - 1

#[1] 1 1 1 1 1 2 2 2 2 2 0 0 0 0 0

CodePudding user response:

Another data.table option

dt1[
  dt1[, .(label = mean(A)), group][
    , label := rank(label) - 1
  ],
  on = .(group)
]

gives

            A group label
 1:  6.052347     0     1
 2:  5.922558     0     1
 3:  4.224303     0     1
 4:  4.067353     0     1
 5:  3.801318     0     1
 6:  8.415603     1     2
 7:  7.179845     1     2
 8:  7.056742     1     2
 9:  6.912266     1     2
10:  7.837252     1     2
11: -2.051671     2     0
12: -1.323038     2     0
13: -1.342514     2     0
14: -1.450518     2     0
15: -1.785296     2     0

CodePudding user response:

another base R

with(list(mu = ave(dt1$A, dt1$group)), match(mu, sort(unique(mu)))) - 1
#  [1] 1 1 1 1 1 2 2 2 2 2 0 0 0 0 0

another data.table

(similar to others, no joins required)

dt1[, mu := mean(A), by = .(group)][, label := match(mu, sort(unique(mu))) - 1][, mu := NULL][]
#              A group label
#          <num> <num> <num>
#  1:  6.3709584     0     1
#  2:  4.4353018     0     1
#  3:  5.3631284     0     1
#  4:  5.6328626     0     1
#  5:  5.4042683     0     1
#  6:  6.8938755     1     2
#  7:  8.5115220     1     2
#  8:  6.9053410     1     2
#  9:  9.0184237     1     2
# 10:  6.9372859     1     2
# 11: -0.6951303     2     0
# 12:  0.2866454     2     0
# 13: -3.3888607     2     0
# 14: -2.2787888     2     0
# 15: -2.1333213     2     0

Data

dt1 <- setDT(structure(list(A = c(6.37095844714667, 4.43530182860391, 5.36312841133734, 5.63286260496104, 5.404268323141, 6.89387548390852, 8.51152199743894, 6.9053409615869, 9.01842371387704, 6.93728590094758, -0.695130345776515, 0.286645392701107, -3.38886070111234, -2.27878876681737, -2.13332133639366), group = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), label = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0)), row.names = c(NA, -15L), class = c("data.table", "data.frame")))

CodePudding user response:

If dplyr available,

library(dplyr)

df1 <- dt1 %>%
  group_by(label) %>%
  summarize(new_label = sum(A)) %>%
  ungroup %>%
  mutate(new_label = rank(x)-1)

dt1 %>%
  left_join(df1, by = "label")

             A label x
 1:  5.7925501     0 1
 2:  5.4773940     0 1
 3:  5.7178468     0 1
 4:  3.5393696     0 1
 5:  4.7657958     0 1
 6:  6.2677985     1 2
 7:  5.4450776     1 2
 8:  6.1410713     1 2
 9:  5.4983753     1 2
10:  6.7543268     1 2
11: -1.5883667     2 0
12: -1.0856677     2 0
13: -1.4062993     2 0
14: -1.9452689     2 0
15: -0.2509141     2 0

CodePudding user response:

Another data.table approach - it's getting the group means, merging them on, then sorting by those values and assigning the value of the .GRP special character

dt1 <- dt1[dt1[, mean(A), by=group], on="group"][order(V1), V1 := .GRP, by=group]

CodePudding user response:

Another apporach with Base R

x <- aggregate(dt1[,1],by=dt1[,2],mean)

merge(dt1,data.frame(group=x[,-2],label=3-order(-x[,2])),by="group")

gives,

#     group         A label
#  1:     0  4.602405     1
#  2:     0  5.619589     1
#  3:     0  4.435626     1
#  4:     0  4.164321     1
#  5:     0  5.334557     1
#  6:     1  8.288011     2
#  7:     1  6.741436     2
#  8:     1  6.693326     2
#  9:     1  5.567287     2
# 10:     1  6.056554     2
# 11:     2 -4.322494     0
# 12:     2 -4.085312     0
# 13:     2 -1.747721     0
# 14:     2 -2.473705     0
# 15:     2 -1.176904     0
  • Related