Get indexes of up to n first occurrences of each unique value in vector-CodePudding

Let's say I have this vector:

v=c(1,1,1,1,2,2,2,3,3,3)

How can I return the indexes of the first two occurrences of each distinct value? I only found this really convoluted way:

> (data.frame(v=v,n=1:length(v))%>%group_by(v)%>%slice_max(v,n=2,with_ties=F))[,2]%>%unlist%>%unname
[1] 1 2 5 6 8 9

And this:

> seen=setNames(numeric(length(unique(v))),unique(v))
> o=c();n=1;for(i in v){x=as.character(i);s=seen[x];if(s<2){o[n]=i;n=n 1};seen[x]=s 1};o
[1] 1 2 5 6 8 9

Edit: The solutions that used data.table::rowid were the fastest in my benchmark:

bench=function(times,...){
  arg=match.call(expand.dots=F)$...
  l=length(arg)
  out=double(times*l)
  rand=sample(rep(1:l,times))
  n=1
  for(x in arg[rand]){t1=Sys.time();eval.parent(x);out[n]=Sys.time()-t1;n=n 1}
  setNames(out,sapply(arg[rand],function(x)unfo(paste(deparse(x),collapse="\n"))))
}

s=function(x,...,ignore.case=F,perl=F,fixed=F,useBytes=F){
  a=match.call(expand.dots=F)$...
  l=length(a)
  for(i in seq(1,l,2))x=gsub(a[[i]],if(i==l)""else a[[i 1]],x,ignore.case=ignore.case,perl=perl,fixed=fixed,useBytes=useBytes)
  x
}

unfo=function(x)s(x,"\\{\\n","\\{","\n *\\}","\\}",",\\n",",","\\n",";"," *([[:punct:]] ) *","\\1")

sizes=10^c(3:5)
r=sapply(sizes,function(i){
  v=round(i*runif(i))
  b=bench(10,
    unname(unlist(tapply(seq_along(v),v,head,2))),
    c(sapply(split(seq_along(v),v),head,2)),
    which(ave(v,v,FUN=seq_along)<3),
    which(data.table::rowid(v)<3),
    seq_along(v)[data.table::rowid(v)<3],
    purrr::map(unique(v),~which(.x==v)[1:2])%>%unlist,
    tibble(v)%>%mutate(row=row_number())%>%group_by(v)%>%slice(1:2)%>%pull(row),
    {seen=setNames(numeric(length(unique(v))),unique(v));o=c();n=1;for(i in v){x=as.character(i);s=seen[x];if(s<2){o[n]=i;n=n 1};seen[x]=s 1};o}
  )
  tapply(b,names(b),median)
})

r2=r[order(r[,3]),]
r3=apply(r2,2,function(x)formatC(x,max(0,2-ceiling(log10(min(x)))),format="f"))
r4=apply(rbind(paste0("1e",log10(sizes)),r3),2,function(x)formatC(x,max(nchar(x)),format="s"))
writeLines(apply(cbind(r4,c("",rownames(r2))),1,paste,collapse=" "))

This shows the median time of ten runs in seconds:

0.00014 0.00099  0.012 which(rowid(v)<3)
0.00015 0.00104  0.012 seq_along(v)[rowid(v)<3]
0.00146 0.01473  0.177 which(ave(v,v,FUN=seq_along)<3)
0.00352 0.03635  0.398 c(sapply(split(seq_along(v),v),head,2))
0.00381 0.03804  0.447 unname(unlist(tapply(seq_along(v),v,head,2)))
0.01764 2.43060  3.772 {seen=setNames(numeric(length(unique(v))),unique(v));o=c();n=1;for(i in v){x=as.character(i);s=seen[x];if(s<2){o[n]=i;n=n 1};seen[x]=s 1};o}
0.04189 0.38720  4.074 tibble(v)%>%mutate(row=row_number())%>%group_by(v)%>%;slice(1:2)%>%pull(row)
0.00315 0.35728 31.102 map(unique(v),~which(.x==v)[1:2])%>%unlist

CodePudding user response：

We can use tapply in base R, use 'v' as the grouping and the input as sequence of 'v', get the first two with head, unlist the list and unname it

unname(unlist(tapply(seq_along(v), v, head, 2)))
[1] 1 2 5 6 8 9

Or split by 'v', get the head by looping over the list with sapply

c(sapply(split(seq_along(v), v), head, 2))
[1] 1 2 5 6 8 9

Or slightly compact option with rowid

library(data.table)
seq_along(v)[rowid(v) < 3]
[1] 1 2 5 6 8 9

Or as @Henrik mentioned, use which directly

which(rowid(id) < 3)

CodePudding user response：

Here is an alternative approach:

library(tibble)
library(dplyr)

my_indices <- tibble(v) %>% 
  mutate(row = row_number()) %>% 
  group_by(v) %>% 
  slice(1:2) %>% 
  pull(row)

my_indices
[1] 1 2 5 6 8 9

CodePudding user response：

Another possible solution:

library(tidyverse)

map(unique(v), ~ which(.x == v)[1:2]) %>% unlist

#> [1] 1 2 5 6 8 9

CodePudding user response：

You could also do:

which(ave(v, v, FUN = seq_along) < 3)

[1] 1 2 5 6 8 9