Home > OS >  R filter data that appear more than once but do not contain certain string
R filter data that appear more than once but do not contain certain string

Time:05-03

I have a data frame as below. I want to first group the data frame by ID and Region, then filter the data frame by Content. I want to keep ID-Region pair that only have both 'High' and 'Mid' in the Content column.

My current data frame:

structure(list(ID = c("a", "a", "a", "a", "a", "b", "b", "c", 
"c", "c", "c", "d", "d", "d"), Content = c("High", "Mid", "Low", 
"High", "Mid", "High", "Mid", "High", "Mid", "High", "Mid", "High", 
"Mid", "Low"), Count = c("3", "6", "2", "3", "7", "3", "5", "8", 
"5", "4", "2", "4", "9", "5"), Region = c("East", "East", "East", 
"West", "West", "East", "East", "East", "East", "West", "West", 
"West", "West", "West")), class = "data.frame", row.names = c(NA, 
-14L))

My expect data frame:

structure(list(ID = c("a", "a", "b", "b", "c", "c", "c", "c"), 
    Content = c("High", "Mid", "High", "Mid", "High", "Mid", 
    "High", "Mid"), Count = c("3", "7", "3", "5", "8", "5", "4", 
    "2"), Region = c("West", "West", "East", "East", "East", 
    "East", "West", "West")), class = "data.frame", row.names = c(NA, 
-8L))

I try the following code, but I cannot keep ID-Region pair that only have both 'High' and 'Mid'.

newdata = data %>%
    group_by(ID,Region) %>%
    filter(n() != 1 & any(Content == "High") & any(Content == "Mid") & any(Content != "Low") %>%
    ungroup

CodePudding user response:

We can filter to keep any groups that do not contain any Content == Low and that must contain High and Mid.

library(dplyr)

df %>%
  group_by(ID,Region) %>%
  filter(!any(Content == "Low") & all(c("High", "Mid") %in% Content) )

Output

  ID    Content Count Region
  <chr> <chr>   <chr> <chr> 
1 a     High    3     West  
2 a     Mid     7     West  
3 b     High    3     East  
4 b     Mid     5     East  
5 c     High    8     East  
6 c     Mid     5     East  
7 c     High    4     West  
8 c     Mid     2     West 

CodePudding user response:

We can change the filter that shows only 'High', 'Mid' values by wrapping with all

library(dplyr)
data %>%
    group_by(ID,Region) %>%    
   filter(all(c("High", "Mid") %in% Content), 
          all(Content %in% c("High", "Mid"))) %>%
    ungroup

-output

# A tibble: 8 × 4
  ID    Content Count Region
  <chr> <chr>   <chr> <chr> 
1 a     High    3     West  
2 a     Mid     7     West  
3 b     High    3     East  
4 b     Mid     5     East  
5 c     High    8     East  
6 c     Mid     5     East  
7 c     High    4     West  
8 c     Mid     2     West  

Or use n_distinct

data %>%
   group_by(ID, Region) %>%
   filter(n_distinct(Content) == 2, 
          all(c("High", "Mid")  %in% Content)) %>% 
   ungroup

-output

# A tibble: 8 × 4
  ID    Content Count Region
  <chr> <chr>   <chr> <chr> 
1 a     High    3     West  
2 a     Mid     7     West  
3 b     High    3     East  
4 b     Mid     5     East  
5 c     High    8     East  
6 c     Mid     5     East  
7 c     High    4     West  
8 c     Mid     2     West  

CodePudding user response:

Update to consider and implement suggestion by @AndrewGB's comment(1 and 2):

library(tidyr)
library(dplyr)

df %>%
  group_by(ID, Region) %>%
    mutate(helper = ifelse(all(Content == "Mid") | 
                           all(Content == "High") | 
                           Content == "Low", "1", NA_character_)) %>% 
  fill(helper, .direction = "up") %>% 
  filter(is.na(helper)) %>% 
  select(-helper)

First answer: Do it this way:

library(dplyr)

df %>%
  group_by(ID, Region) %>%
  filter(!any(Content == "Low"))
  ID    Content Count Region
  <chr> <chr>   <int> <chr> 
1 a     High        3 West  
2 a     Mid         7 West  
3 b     High        3 East  
4 b     Mid         5 East  
5 C     High        8 East  
6 C     Mid         5 East  
7 C     High        4 West  
8 C     Mid         2 West  
  • Related