How to perform condition for specific ranks in R-CodePudding

This is example of my dataset

timeplot=structure(list(SAP = c("s643", "s643", "s643", "s643", "s643", 
"s643", "s643", "s643", "s643", "s643", "s643", "s985", "s985", 
"s985", "s985", "s985", "s985", "s985", "s985", "s985", "s985", 
"s985", "s985", "s985"), hour = c(" 09 hour ", " 10 hour ", " 11 hour ", 
" 19 hour ", " 18 hour ", " 12 hour ", " 17 hour ", " 15 hour ", 
" 14 hour ", " 16 hour ", " 13 hour ", " 22 hour ", " 10 hour ", 
" 21 hour ", " 11 hour ", " 18 hour ", " 19 hour ", " 12 hour ", 
" 16 hour ", " 17 hour ", " 13 hour ", " 15 hour ", " 20 hour ", 
" 14 hour "), value = c("3,6", "51685,42769", "85539,56302", 
"96901,46277", "125787,0896", "128681,5323", "136605,3676", "143449,3214", 
"145656,5062", "147891,4464", "207335,5267", "-1204,704396", 
"60284,83147", "107269,9123", "117911,4336", "155052,8752", "163971,1803", 
"177794,5036", "184859,2897", "212647,2968", "233137,2999", "244140,2991", 
"262414,2152", "320948,6083"), rank = c(1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 11L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 
11L, 12L, 13L)), class = "data.frame", row.names = c(NA, -24L
))

How to do that if the first rank is at the very first hour, for example, in this case for SAP = s643 min hour=9 and max hour=19,and we see that the first rank is at the very minimum hour(9), then put the flag "removed". Also, if there is a situation that the first rank is in the last hour(like sap=s985), then we also set the flag "remove". If the first rank for hours is in the range from 11-18, we do not set any flag.

I.E. in this example correct desired output (for each SAP separately , cause for each sap, the number of available hours is different.)

SAP hour    value   rank                 Flag
s643     09 hour    3,6         1   "removed"
s643     10 hour    51685,42769 2   
s643     11 hour    85539,56302 3   
s643     19 hour    96901,46277 4   
s643     18 hour    125787,0896 5   
s643     12 hour    128681,5323 6   
s643     17 hour    136605,3676 7   
s643     15 hour    143449,3214 8   
s643     14 hour    145656,5062 9   
s643     16 hour    147891,4464 10  
s643     13 hour    207335,5267 11  
s985     22 hour    -1204,704396    1   "removed"
s985     10 hour    60284,83147 2   
s985     21 hour    107269,9123 3   
s985     11 hour    117911,4336 4   
s985     18 hour    155052,8752 5   
s985     19 hour    163971,1803 6   
s985     12 hour    177794,5036 7   
s985     16 hour    184859,2897 8   
s985     17 hour    212647,2968 9   
s985     13 hour    233137,2999 10  
s985     15 hour    244140,2991 11  
s985     20 hour    262414,2152 12  
s985     14 hour    320948,6083 13

How can i get desired output? Thanks for your help.

CodePudding user response：

We can use a case_when() statement to evaluate your condition and set Flag.

Make sure we arrange(hour) before mutate(), this guarantee the use of first() and last() in case_when() is accurate.

library(dplyr)

timeplot %>% 
  group_by(SAP) %>% 
  arrange(hour) %>% 
  mutate(Flag = case_when(rank == 1 & (hour == first(hour) | hour == last(hour)) ~ "removed",
                          TRUE ~ " "))
#> # A tibble: 24 × 5
#> # Groups:   SAP [2]
#>    SAP   hour        value        rank Flag     
#>    <chr> <chr>       <chr>       <int> <chr>    
#>  1 s643  " 09 hour " 3,6             1 "removed"
#>  2 s643  " 10 hour " 51685,42769     2 " "      
#>  3 s985  " 10 hour " 60284,83147     2 " "      
#>  4 s643  " 11 hour " 85539,56302     3 " "      
#>  5 s985  " 11 hour " 117911,4336     4 " "      
#>  6 s643  " 12 hour " 128681,5323     6 " "      
#>  7 s985  " 12 hour " 177794,5036     7 " "      
#>  8 s643  " 13 hour " 207335,5267    11 " "      
#>  9 s985  " 13 hour " 233137,2999    10 " "      
#> 10 s643  " 14 hour " 145656,5062     9 " "      
#> # … with 14 more rows

CodePudding user response：

Use mutate() and group_by() from {dplyr} package within the {tidyverse}. then use ifelse() statement

library(tidyverse)

timeplot <- timeplot %>%
  mutate(hour_digit = as.numeric(gsub("([0-9] ).*$", "\\1", hour)))


timeplot <- timeplot %>%
  group_by(SAP) %>%
  mutate(Flag = ifelse(rank == min(rank) & (hour_digit == min(hour_digit) | hour_digit == max(hour_digit)), "removed", NA )) %>%
  ungroup()