Let's say I have this vector:
v=c(1,1,1,1,2,2,2,3,3,3)
How can I return the indexes of the first two occurrences of each distinct value? I only found this really convoluted way:
> (data.frame(v=v,n=1:length(v))%>%group_by(v)%>%slice_max(v,n=2,with_ties=F))[,2]%>%unlist%>%unname
[1] 1 2 5 6 8 9
And this:
> seen=setNames(numeric(length(unique(v))),unique(v))
> o=c();n=1;for(i in v){x=as.character(i);s=seen[x];if(s<2){o[n]=i;n=n 1};seen[x]=s 1};o
[1] 1 2 5 6 8 9
Edit: The solutions that used data.table::rowid
were the fastest in my benchmark:
bench=function(times,...){
arg=match.call(expand.dots=F)$...
l=length(arg)
out=double(times*l)
rand=sample(rep(1:l,times))
n=1
for(x in arg[rand]){t1=Sys.time();eval.parent(x);out[n]=Sys.time()-t1;n=n 1}
setNames(out,sapply(arg[rand],function(x)unfo(paste(deparse(x),collapse="\n"))))
}
s=function(x,...,ignore.case=F,perl=F,fixed=F,useBytes=F){
a=match.call(expand.dots=F)$...
l=length(a)
for(i in seq(1,l,2))x=gsub(a[[i]],if(i==l)""else a[[i 1]],x,ignore.case=ignore.case,perl=perl,fixed=fixed,useBytes=useBytes)
x
}
unfo=function(x)s(x,"\\{\\n","\\{","\n *\\}","\\}",",\\n",",","\\n",";"," *([[:punct:]] ) *","\\1")
sizes=10^c(3:5)
r=sapply(sizes,function(i){
v=round(i*runif(i))
b=bench(10,
unname(unlist(tapply(seq_along(v),v,head,2))),
c(sapply(split(seq_along(v),v),head,2)),
which(ave(v,v,FUN=seq_along)<3),
which(data.table::rowid(v)<3),
seq_along(v)[data.table::rowid(v)<3],
purrr::map(unique(v),~which(.x==v)[1:2])%>%unlist,
tibble(v)%>%mutate(row=row_number())%>%group_by(v)%>%slice(1:2)%>%pull(row),
{seen=setNames(numeric(length(unique(v))),unique(v));o=c();n=1;for(i in v){x=as.character(i);s=seen[x];if(s<2){o[n]=i;n=n 1};seen[x]=s 1};o}
)
tapply(b,names(b),median)
})
r2=r[order(r[,3]),]
r3=apply(r2,2,function(x)formatC(x,max(0,2-ceiling(log10(min(x)))),format="f"))
r4=apply(rbind(paste0("1e",log10(sizes)),r3),2,function(x)formatC(x,max(nchar(x)),format="s"))
writeLines(apply(cbind(r4,c("",rownames(r2))),1,paste,collapse=" "))
This shows the median time of ten runs in seconds:
0.00014 0.00099 0.012 which(rowid(v)<3)
0.00015 0.00104 0.012 seq_along(v)[rowid(v)<3]
0.00146 0.01473 0.177 which(ave(v,v,FUN=seq_along)<3)
0.00352 0.03635 0.398 c(sapply(split(seq_along(v),v),head,2))
0.00381 0.03804 0.447 unname(unlist(tapply(seq_along(v),v,head,2)))
0.01764 2.43060 3.772 {seen=setNames(numeric(length(unique(v))),unique(v));o=c();n=1;for(i in v){x=as.character(i);s=seen[x];if(s<2){o[n]=i;n=n 1};seen[x]=s 1};o}
0.04189 0.38720 4.074 tibble(v)%>%mutate(row=row_number())%>%group_by(v)%>%;slice(1:2)%>%pull(row)
0.00315 0.35728 31.102 map(unique(v),~which(.x==v)[1:2])%>%unlist
CodePudding user response:
We can use tapply
in base R
, use 'v' as the grouping and the input as sequence of 'v', get the first two with head
, unlist
the list
and unname
it
unname(unlist(tapply(seq_along(v), v, head, 2)))
[1] 1 2 5 6 8 9
Or split
by 'v', get the head
by looping over the list
with sapply
c(sapply(split(seq_along(v), v), head, 2))
[1] 1 2 5 6 8 9
Or slightly compact option with rowid
library(data.table)
seq_along(v)[rowid(v) < 3]
[1] 1 2 5 6 8 9
Or as @Henrik mentioned, use which
directly
which(rowid(id) < 3)
CodePudding user response:
Here is an alternative approach:
library(tibble)
library(dplyr)
my_indices <- tibble(v) %>%
mutate(row = row_number()) %>%
group_by(v) %>%
slice(1:2) %>%
pull(row)
my_indices
[1] 1 2 5 6 8 9
CodePudding user response:
Another possible solution:
library(tidyverse)
map(unique(v), ~ which(.x == v)[1:2]) %>% unlist
#> [1] 1 2 5 6 8 9
CodePudding user response:
You could also do:
which(ave(v, v, FUN = seq_along) < 3)
[1] 1 2 5 6 8 9