Home > Enterprise >  R DUMMY VARIABLE
R DUMMY VARIABLE

Time:05-12

 DATA=data.frame(   STUDENT=c(1 ,1 ,1 ,1 ,1 ,1 ,1 ,2 ,2 ,2 ,2 ,2 ,2),
    SCORE=c(92 ,64 ,83 ,78 ,69 ,52 ,100 ,69 ,76 ,100 ,74 ,75 ,56),
    DATE=c("1/21/2000" ,"1/21/2000" ,"5/29/2000" ,"7/10/2000" ,"7/22/2000" ,"7/22/2000" ,"8/10/2000" ,"2/20/2000" ,"8/29/2000" ,"9/15/2000" ,"3/7/2001" ,"7/7/2001" ,"8/18/2001"),
    NEW1=c(1 ,0 ,1 ,1 ,1 ,0 ,1 ,1 ,1 ,1 ,1 ,1 ,1),
    NEW2=c(NA ,-99 ,129 ,42 ,12 ,-99 ,19 ,NA ,191 ,17 ,173 ,122 ,42)))

I have 'DATA' and wish to create 'NEW1' and also 'NEW2' and 'NEW1' is basically 1 for the unique combo of 'STUDENT' and 'DATE' or 0 if the combination repeats basically. And 'NEW2' is you take the subtraction of the dates for EACH 'STUDENT' to calculate days between tests BUT you skip the ones where there is a 0 IN 'NEW1' since it is duplicate.

CodePudding user response:

with dplyr:

df %>% 
  mutate(DATE=as.Date(DATE,"%m/%d/%Y")) %>% 
  arrange(DATE) %>% 
  group_by(STUDENT,DATE) %>% 
  mutate(NEW1 = as.numeric(row_number()==1)) %>%  
  group_by(STUDENT) %>% 
  mutate(NEW2 = if_else(NEW1==1, as.numeric(DATE-lag(DATE)),-99))

Output:

   STUDENT SCORE DATE        NEW1  NEW2
     <dbl> <dbl> <date>     <dbl> <dbl>
 1       1    92 2000-01-21     1    NA
 2       1    64 2000-01-21     0   -99
 3       2    69 2000-02-20     1    NA
 4       1    83 2000-05-29     1   129
 5       1    78 2000-07-10     1    42
 6       1    69 2000-07-22     1    12
 7       1    52 2000-07-22     0   -99
 8       1   100 2000-08-10     1    19
 9       2    76 2000-08-29     1   191
10       2   100 2000-09-15     1    17
11       2    74 2001-03-07     1   173
12       2    75 2001-07-07     1   122
13       2    56 2001-08-18     1    42

Here is a rather verbose data.table approach:

library(data.table)

setDT(df)[, DATE:=as.Date(DATE,"%m/%d/%Y")] %>% 
  .[order(STUDENT,DATE),id:=1:.N, by=.(STUDENT,DATE)] %>% 
  .[,NEW1:=as.numeric(id==1)] %>% 
  .[,NEW2:=fifelse(id==1, as.numeric(DATE-lag(DATE)), -99),by=STUDENT] %>% 
  .[,id:=NULL] %>% 
  .[]

Output:

    STUDENT SCORE       DATE NEW1 NEW2
 1:       1    92 2000-01-21    1   NA
 2:       1    64 2000-01-21    0  -99
 3:       1    83 2000-05-29    1  129
 4:       1    78 2000-07-10    1   42
 5:       1    69 2000-07-22    1   12
 6:       1    52 2000-07-22    0  -99
 7:       1   100 2000-08-10    1   19
 8:       2    69 2000-02-20    1   NA
 9:       2    76 2000-08-29    1  191
10:       2   100 2000-09-15    1   17
11:       2    74 2001-03-07    1  173
12:       2    75 2001-07-07    1  122
13:       2    56 2001-08-18    1   42

CodePudding user response:

Try this using tibbles (from library(tidyverse)):

data=tibble(STUDENT=c(1 ,1 ,1 ,1 ,1 ,1 ,1 ,2 ,2 ,2 ,2 ,2 ,2),
            SCORE=c(92 ,64 ,83 ,78 ,69 ,52 ,100 ,69 ,76 ,100 ,74 ,75 ,56),
            DATE=c("1/21/2000" ,"1/21/2000" ,"5/29/2000" ,"7/10/2000" ,"7/22/2000" ,"7/22/2000" ,"8/10/2000" ,"2/20/2000" ,"8/29/2000" ,"9/15/2000" ,"3/7/2001" ,"7/7/2001" ,"8/18/2001"),
            NEW1=c(1 ,0 ,1 ,1 ,1 ,0 ,1 ,1 ,1 ,1 ,1 ,1 ,1),
            NEW2=c(NA ,-99 ,129 ,42 ,12 ,-99 ,19 ,NA ,191 ,17 ,173 ,122 ,42))

data %>% 
  clean_names %>% 
  mutate(dupe_indicator = ifelse(student == lag(student) & date == lag(date), 0, 1)) %>% 
  mutate(time_diff = ifelse(dupe_indicator == 1 & student == lag(student), mdy(date) - mdy(lag(date)), "NA"))
  • Related