I have this data frame:
DATE | EVENT | EQUIPMENT | MASK |
---|---|---|---|
2021/12/08 | enterEquipment | L1 | a |
2021/12/09 | Decouple | NA | a |
2021/12/09 | stateChange | NA | a |
2021/12/10 | leaveEquipment | L1 | a |
2021/12/10 | enterEquipment | L1 | b |
2021/12/11 | Decouple | NA | b |
2021/12/11 | stateChange | NA | b |
2021/12/11 | leaveEquipment | L1 | b |
I would like to create a new column that contains the data of the equipment column and replace the NA Values with the box above. i wrote this code
DC<-df[order(df$MASK,df$TIMESTAMP),]
Equipement_Correct <- df$EQUIPMENT
Equipement_Correct <-ifelse(is.na(as.character(Equipement_Correct [2:nrow(DC)]))&as.character(DC$MASK[2:nrow(DC)])==as.character(DC$MASK[1:nrow(DC)-1]) & (as.character(DC$EVENT[2:nrow(DC)])=='decouple' | as.character(DC$EVENT[2:nrow(DC)])=='stateChange' |,
as.character(Equipement_Correct [1:nrow(DC)-1]),
as.character(Equipement_Correct [2:nrow(DC)]))
Equipement_Correct <-c(NA,Equipement_Correct )
dfOut<-data.frame(DC,Equipement_Correct)
and I get:
DATE | EVENT | EQUIPMENT | MASK | Equipement_Correct |
---|---|---|---|---|
2021/12/08 | enterEquipment | L1 | a | L1 |
2021/12/09 | Decouple | NA | a | L1 |
2021/12/09 | stateChange | NA | a | NA |
2021/12/10 | leaveEquipment | L1 | a | L1 |
2021/12/10 | enterEquipment | L2 | b | L1 |
2021/12/11 | Decouple | NA | b | L2 |
2021/12/11 | stateChange | NA | b | NA |
2021/12/11 | leaveEquipment | L1 | b | L2 |
and I want to get:
DATE | EVENT | EQUIPMENT | MASK | Equipement_Correct |
---|---|---|---|---|
2021/12/08 | enterEquipment | L1 | a | L1 |
2021/12/09 | Decouple | NA | a | L1 |
2021/12/09 | stateChange | NA | a | L1 |
2021/12/10 | leaveEquipment | L1 | a | L1 |
2021/12/10 | enterEquipment | L2 | b | L1 |
2021/12/11 | Decouple | NA | b | L2 |
2021/12/11 | stateChange | NA | b | L2 |
2021/12/11 | leaveEquipment | L1 | b | L2 |
by using a loop on my code I manage to get the desired result but I would like not to use a loop because I work on millions of lines,
for (i in 1:5) {
Equipement_Correct <-ifelse(is.na(as.character(Equipement_Correct [2:nrow(DC)]))&as.character(DC$MASK[2:nrow(DC)])==as.character(DC$MASK[1:nrow(DC)-1]) & (as.character(DC$EVENT[2:nrow(DC)])=='decouple' | as.character(DC$EVENT[2:nrow(DC)])=='stateChange' |,
as.character(Equipement_Correct [1:nrow(DC)-1]),
as.character(Equipement_Correct [2:nrow(DC)]))
Equipement_Correct <-c(NA,Equipement_Correct )
}
Is there a way not to use a loop?
CodePudding user response:
Assuming the EVENT entries in the data base are ordered and complete as shown, you could coerce to factor
and back as.integer
, which in this case gives EVENT_N
with c(1, 2, 3, 4)
repeating "forever". Now, you may exploit the cumsum
which we calculate on the zero values of modulo %% 4
, i.e. the cumulative sum of the resulting TRUE
's. We need to subtract - 1
to get the lag right.
lv <- c("enterEquipment", "Decouple", "stateChange", "leaveEquipment")
dat <- within(dat, {
EVENT_N <- as.integer(factor(EVENT, levels=lv))
Equipement_Correct <- paste0('L', cumsum(EVENT_N %% 4 - 1 == 0))
rm(EVENT_N) ## removes temporary column
})
dat
# DATE EVENT EQUIPMENT MASK Equipement_Correct
# 1 2021/12/08 enterEquipment L1 a L1
# 2 2021/12/09 Decouple <NA> a L1
# 3 2021/12/09 stateChange <NA> a L1
# 4 2021/12/10 leaveEquipment L1 a L1
# 5 2021/12/10 enterEquipment L2 b L2
# 6 2021/12/11 Decouple <NA> b L2
# 7 2021/12/11 stateChange <NA> b L2
# 8 2021/12/11 leaveEquipment L1 b L2
Maybe it's faster to use a K
ey data frame instead of factor
.
K <- data.frame(matrix(c("enterEquipment", "Decouple", "stateChange",
"leaveEquipment", 1:4), ncol=2)) |>
type.convert(as.is=TRUE)
within(dat, {
Equipement_Correct <-
paste0('L', cumsum(K[match(EVENT, K[, 1]), 2] %% 4 - 1 == 0))
})
Data:
dat <- structure(list(DATE = c("2021/12/08", "2021/12/09", "2021/12/09",
"2021/12/10", "2021/12/10", "2021/12/11", "2021/12/11", "2021/12/11"
), EVENT = c("enterEquipment", "Decouple", "stateChange", "leaveEquipment",
"enterEquipment", "Decouple", "stateChange", "leaveEquipment"
), EQUIPMENT = c("L1", NA, NA, "L1", "L2", NA, NA, "L1"), MASK = c("a",
"a", "a", "a", "b", "b", "b", "b"), Equipement_Correct = c("L1",
"L1", NA, "L1", "L1", "L2", NA, "L2")), class = "data.frame", row.names = c(NA,
-8L))
CodePudding user response:
Use this package 'zoo' where you get function na.locf (last observation carry forward)
install.packages(zoo)
library(zoo)
data$EQUIPMENT <- na.locf(data$EQUIPMENT)
eg: > library(zoo)
> data <- data.frame(New= c("L1",NA,NA,"L2",NA,"L3",NA,NA), old = 1:8)
> data
New old
1 L1 1
2 <NA> 2
3 <NA> 3
4 L2 4
5 <NA> 5
6 L3 6
7 <NA> 7
8 <NA> 8
> data$New <- na.locf(data$New)
> data
New old
1 L1 1
2 L1 2
3 L1 3
4 L2 4
5 L2 5
6 L3 6
7 L3 7
8 L3 8
CodePudding user response:
You can also use tidyr::fill
for this purpose:
library(tidyr)
df %>%
fill(Equipement_Correct, .direction = "down")
DATE EVENT EQUIPMENT MASK Equipement_Correct
1 2021/12/08 enterEquipment L1 a L1
2 2021/12/09 Decouple <NA> a L1
3 2021/12/09 stateChange <NA> a L1
4 2021/12/10 leaveEquipment L1 a L1
5 2021/12/10 enterEquipment L2 b L1
6 2021/12/11 Decouple <NA> b L2
7 2021/12/11 stateChange <NA> b L2
8 2021/12/11 leaveEquipment L1 b L2