DATA = data.frame(STUDENT = c(1,1,1,1,1,2,2,2,2,3,3,3),
YEAR = c(2000,2000,2001,2001,2002,2000,2001,2001,2002,2000,2001,2001),
SEMESTER = c(1,2,1,2,1,2,1,2,1,1,2,1),
SCORE = c(7,4,5,6,8,9,1,1,1,2,3,4),
WANT= c(NA, NA, 1, NA, 1, NA, 0, NA, 2, NA, 1, NA),
WANT2=c(NA, NA, 1, NA, 2, NA, -8, NA, 0, NA, 1, NA))
I have 'DATA' and wish to create the 'WANT' variable which does this:
if SCORE from SEMESTER = 1 and YEAR = N > SCORE from SEMESTER = 2 and YEAR = N-1, 1
if SCORE from SEMESTER = 1 and YEAR = N < SCORE from SEMESTER = 2 and YEAR = N-1, 2
if SCORE from SEMESTER = 1 and YEAR = N = SCORE from SEMESTER = 2 and YEAR = N-1, 0
CodePudding user response:
If I'm understanding your question correctly, this could be one way of doing it:
library(tidyverse)
DATA = data.frame(STUDENT = c(1,1,1,1,1,2,2,2,2,3,3,3),
YEAR = c(2000,2000,2001,2001,2002,2000,2001,2001,2002,2000,2001,2001),
SEMESTER = c(1,2,1,2,1,2,1,2,1,1,2,1),
SCORE = c(7,4,5,6,8,9,1,1,1,2,3,4),
WANT= c(NA, NA, 1, NA, 1, NA, 0, NA, 2, NA, 1, NA))
DATA |>
# These lines fill in 'missing' semesters
complete(STUDENT, YEAR, SEMESTER) |>
arrange(STUDENT, YEAR, SEMESTER) |>
group_by(STUDENT) |>
# These lines check 'last score' for each student
mutate(WANT = case_when(
SEMESTER == 2 ~ NA,
SCORE > lag(SCORE) ~ 1,
SCORE < lag(SCORE) ~ 2,
SCORE == lag(SCORE) ~ 0
)
) |>
# These lines re-shorten code to only those containing scores
filter(!is.na(SCORE))
#> # A tibble: 12 × 5
#> # Groups: STUDENT [3]
#> STUDENT YEAR SEMESTER SCORE WANT
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2000 1 7 NA
#> 2 1 2000 2 4 NA
#> 3 1 2001 1 5 1
#> 4 1 2001 2 6 NA
#> 5 1 2002 1 8 1
#> 6 2 2000 2 9 NA
#> 7 2 2001 1 1 2
#> 8 2 2001 2 1 NA
#> 9 2 2002 1 1 0
#> 10 3 2000 1 2 NA
#> 11 3 2001 1 4 NA
#> 12 3 2001 2 3 NA
CodePudding user response:
Using dplyr
with case_when
:
DATA %>%
group_by(STUDENT) %>%
arrange(YEAR, SEMESTER) %>%
mutate(WANT = case_when(SEMESTER == 1 & lag(SEMESTER) == 2 & YEAR == lag(YEAR) 1 & SCORE > lag(SCORE) ~ 1,
SEMESTER == 1 & lag(SEMESTER) == 2 & YEAR == lag(YEAR) 1 & SCORE < lag(SCORE) ~ 2,
SEMESTER == 1 & lag(SEMESTER) == 2 & YEAR == lag(YEAR) 1 & SCORE == lag(SCORE) ~ 0)) %>%
arrange(STUDENT)
Result:
STUDENT YEAR SEMESTER SCORE WANT
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2000 1 7 NA
2 1 2000 2 4 NA
3 1 2001 1 5 1
4 1 2001 2 6 NA
5 1 2002 1 8 1
6 2 2000 2 9 NA
7 2 2001 1 1 2
8 2 2001 2 1 NA
9 2 2002 1 1 0
10 3 2000 1 2 NA
11 3 2001 1 4 NA
12 3 2001 2 3 NA
CodePudding user response:
Not exactly your expected results but i think you need to use pivot_wider
DATA |>
select(STUDENT, YEAR, SEMESTER, SCORE) |>
pivot_wider(names_from = SEMESTER, values_from = SCORE) |>
complete(YEAR) |>
arrange(YEAR) |>
group_by(STUDENT) |>
mutate(CHANGE = case_when(`1` > lag(`2`,1) ~ 1,
`1` < lag(`2`,1) ~ 2,
`1` == lag(`2`,1) ~ 0,
TRUE ~ NA_real_)) |>
ungroup() |>
arrange(STUDENT, YEAR)
##> # A tibble: 8 × 5
##> YEAR STUDENT `1` `2` CHANGE
##> <dbl> <dbl> <dbl> <dbl> <dbl>
##> 1 2000 1 7 4 NA
##> 2 2001 1 5 6 1
##> 3 2002 1 8 NA 1
##> 4 2000 2 NA 9 NA
##> 5 2001 2 1 1 2
##> 6 2002 2 1 NA 0
##> 7 2000 3 2 NA NA
##> 8 2001 3 4 3 NA