DATA=data.frame( STUDENT=c(1 ,1 ,1 ,1 ,1 ,1 ,1 ,2 ,2 ,2 ,2 ,2 ,2),
SCORE=c(92 ,64 ,83 ,78 ,69 ,52 ,100 ,69 ,76 ,100 ,74 ,75 ,56),
DATE=c("1/21/2000" ,"1/21/2000" ,"5/29/2000" ,"7/10/2000" ,"7/22/2000" ,"7/22/2000" ,"8/10/2000" ,"2/20/2000" ,"8/29/2000" ,"9/15/2000" ,"3/7/2001" ,"7/7/2001" ,"8/18/2001"),
NEW1=c(1 ,0 ,1 ,1 ,1 ,0 ,1 ,1 ,1 ,1 ,1 ,1 ,1),
NEW2=c(NA ,-99 ,129 ,42 ,12 ,-99 ,19 ,NA ,191 ,17 ,173 ,122 ,42)))
I have 'DATA' and wish to create 'NEW1' and also 'NEW2' and 'NEW1' is basically 1 for the unique combo of 'STUDENT' and 'DATE' or 0 if the combination repeats basically. And 'NEW2' is you take the subtraction of the dates for EACH 'STUDENT' to calculate days between tests BUT you skip the ones where there is a 0 IN 'NEW1' since it is duplicate.
CodePudding user response:
with dplyr
:
df %>%
mutate(DATE=as.Date(DATE,"%m/%d/%Y")) %>%
arrange(DATE) %>%
group_by(STUDENT,DATE) %>%
mutate(NEW1 = as.numeric(row_number()==1)) %>%
group_by(STUDENT) %>%
mutate(NEW2 = if_else(NEW1==1, as.numeric(DATE-lag(DATE)),-99))
Output:
STUDENT SCORE DATE NEW1 NEW2
<dbl> <dbl> <date> <dbl> <dbl>
1 1 92 2000-01-21 1 NA
2 1 64 2000-01-21 0 -99
3 2 69 2000-02-20 1 NA
4 1 83 2000-05-29 1 129
5 1 78 2000-07-10 1 42
6 1 69 2000-07-22 1 12
7 1 52 2000-07-22 0 -99
8 1 100 2000-08-10 1 19
9 2 76 2000-08-29 1 191
10 2 100 2000-09-15 1 17
11 2 74 2001-03-07 1 173
12 2 75 2001-07-07 1 122
13 2 56 2001-08-18 1 42
Here is a rather verbose data.table approach:
library(data.table)
setDT(df)[, DATE:=as.Date(DATE,"%m/%d/%Y")] %>%
.[order(STUDENT,DATE),id:=1:.N, by=.(STUDENT,DATE)] %>%
.[,NEW1:=as.numeric(id==1)] %>%
.[,NEW2:=fifelse(id==1, as.numeric(DATE-lag(DATE)), -99),by=STUDENT] %>%
.[,id:=NULL] %>%
.[]
Output:
STUDENT SCORE DATE NEW1 NEW2
1: 1 92 2000-01-21 1 NA
2: 1 64 2000-01-21 0 -99
3: 1 83 2000-05-29 1 129
4: 1 78 2000-07-10 1 42
5: 1 69 2000-07-22 1 12
6: 1 52 2000-07-22 0 -99
7: 1 100 2000-08-10 1 19
8: 2 69 2000-02-20 1 NA
9: 2 76 2000-08-29 1 191
10: 2 100 2000-09-15 1 17
11: 2 74 2001-03-07 1 173
12: 2 75 2001-07-07 1 122
13: 2 56 2001-08-18 1 42
CodePudding user response:
Try this using tibbles (from library(tidyverse)
):
data=tibble(STUDENT=c(1 ,1 ,1 ,1 ,1 ,1 ,1 ,2 ,2 ,2 ,2 ,2 ,2),
SCORE=c(92 ,64 ,83 ,78 ,69 ,52 ,100 ,69 ,76 ,100 ,74 ,75 ,56),
DATE=c("1/21/2000" ,"1/21/2000" ,"5/29/2000" ,"7/10/2000" ,"7/22/2000" ,"7/22/2000" ,"8/10/2000" ,"2/20/2000" ,"8/29/2000" ,"9/15/2000" ,"3/7/2001" ,"7/7/2001" ,"8/18/2001"),
NEW1=c(1 ,0 ,1 ,1 ,1 ,0 ,1 ,1 ,1 ,1 ,1 ,1 ,1),
NEW2=c(NA ,-99 ,129 ,42 ,12 ,-99 ,19 ,NA ,191 ,17 ,173 ,122 ,42))
data %>%
clean_names %>%
mutate(dupe_indicator = ifelse(student == lag(student) & date == lag(date), 0, 1)) %>%
mutate(time_diff = ifelse(dupe_indicator == 1 & student == lag(student), mdy(date) - mdy(lag(date)), "NA"))