Imagine you have the following data set:


df = data.frame(ID = c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20), gender= c(1,2,1,2,2,2,2,1,1,2,1,2,1,2,2,2,2,1,1,2),
                PID = c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10))

how can I write a code that removes the rows in the df whose gender and PID are the same (see picture). Please imagine that the code is over 1000 rows long (so it should be a solution that automatically searches for the right values to exclude).

enter image description here

CodePudding user response：

base R

df[ave(rep(TRUE, nrow(df)), df[,c("gender","paar")], FUN = function(z) !any(duplicated(z))),]
#    ID gender paar
# 1   1      1    1
# 2   2      2    1
# 3   3      1    2
# 4   4      2    2
# 7   7      2    4
# 8   8      1    4
# 9   9      1    5
# 10 10      2    5
# 11 11      1    6
# 12 12      2    6
# 13 13      1    7
# 14 14      2    7
# 17 17      2    9
# 18 18      1    9
# 19 19      1   10
# 20 20      2   10

dplyr

library(dplyr)
df %>%
  group_by(gender, paar) %>%
  filter(!any(duplicated(cbind(gender, paar)))) %>%
  ungroup()

CodePudding user response：

In base R, we may use subset after removing the observations where the group count for 'gender' and 'paar' are not 1

subset(df, ave(seq_along(gender), gender, paar, FUN = length) == 1)

Or with duplicated

df[!(duplicated(df[-1])|duplicated(df[-1], fromLast = TRUE)),]

-output

   ID gender paar
1   1      1    1
2   2      2    1
3   3      1    2
4   4      2    2
7   7      2    4
8   8      1    4
9   9      1    5
10 10      2    5
11 11      1    6
12 12      2    6
13 13      1    7
14 14      2    7
17 17      2    9
18 18      1    9
19 19      1   10
20 20      2   10

CodePudding user response：

Here is one more: :-)

library(dplyr)
df %>%
  group_by(gender, PID) %>%  
  filter(is.na(ifelse(n()>1, 1, NA)))

     ID gender   PID
   <dbl>  <dbl> <dbl>
 1     1      1     1
 2     2      2     1
 3     3      1     2
 4     4      2     2
 5     7      2     4
 6     8      1     4
 7     9      1     5
 8    10      2     5
 9    11      1     6
10    12      2     6
11    13      1     7
12    14      2     7
13    17      2     9
14    18      1     9
15    19      1    10
16    20      2    10

CodePudding user response：

Another dplyr option could be:

df %>%
 filter(with(rle(paste0(gender, PID)), rep(lengths == 1, lengths)))

   ID gender PID
1   1      1   1
2   2      2   1
3   3      1   2
4   4      2   2
5   7      2   4
6   8      1   4
7   9      1   5
8  10      2   5
9  11      1   6
10 12      2   6
11 13      1   7
12 14      2   7
13 17      2   9
14 18      1   9
15 19      1  10
16 20      2  10

If the duplicated values can occur also between non-consecutive rows:

df %>%
 arrange(gender, PID) %>%
 filter(with(rle(paste0(gender, PID)), rep(lengths == 1, lengths)))

CodePudding user response：

Using aggregate

na.omit(aggregate(. ~ gender   PID, df, function(x) 
  ifelse(length(x) == 1, x, NA)))
   gender PID ID
1       1   1  1
2       2   1  2
3       1   2  3
4       2   2  4
6       1   4  8
7       2   4  7
8       1   5  9
9       2   5 10
10      1   6 11
11      2   6 12
12      1   7 13
13      2   7 14
15      1   9 18
16      2   9 17
17      1  10 19
18      2  10 20

With dplyr

library(dplyr)

df %>% 
  group_by(gender, PID) %>% 
  filter(n() == 1) %>% 
  ungroup()
# A tibble: 16 × 3
      ID gender   PID
   <dbl>  <dbl> <dbl>
 1     1      1     1
 2     2      2     1
 3     3      1     2
 4     4      2     2
 5     7      2     4
 6     8      1     4
 7     9      1     5
 8    10      2     5
 9    11      1     6
10    12      2     6
11    13      1     7
12    14      2     7
13    17      2     9
14    18      1     9
15    19      1    10
16    20      2    10