Home > Software design >  replacing Na value of a dataFrame without using a loop
replacing Na value of a dataFrame without using a loop

Time:12-28

I have this data frame:

DATE EVENT EQUIPMENT MASK
2021/12/08 enterEquipment L1 a
2021/12/09 Decouple NA a
2021/12/09 stateChange NA a
2021/12/10 leaveEquipment L1 a
2021/12/10 enterEquipment L1 b
2021/12/11 Decouple NA b
2021/12/11 stateChange NA b
2021/12/11 leaveEquipment L1 b

I would like to create a new column that contains the data of the equipment column and replace the NA Values ​​with the box above. i wrote this code

DC<-df[order(df$MASK,df$TIMESTAMP),]
Equipement_Correct <- df$EQUIPMENT

Equipement_Correct <-ifelse(is.na(as.character(Equipement_Correct [2:nrow(DC)]))&as.character(DC$MASK[2:nrow(DC)])==as.character(DC$MASK[1:nrow(DC)-1])  & (as.character(DC$EVENT[2:nrow(DC)])=='decouple' | as.character(DC$EVENT[2:nrow(DC)])=='stateChange' |,
                                           as.character(Equipement_Correct [1:nrow(DC)-1]),
                                           as.character(Equipement_Correct [2:nrow(DC)]))
Equipement_Correct <-c(NA,Equipement_Correct )


dfOut<-data.frame(DC,Equipement_Correct)

and I get:

DATE EVENT EQUIPMENT MASK Equipement_Correct
2021/12/08 enterEquipment L1 a L1
2021/12/09 Decouple NA a L1
2021/12/09 stateChange NA a NA
2021/12/10 leaveEquipment L1 a L1
2021/12/10 enterEquipment L2 b L1
2021/12/11 Decouple NA b L2
2021/12/11 stateChange NA b NA
2021/12/11 leaveEquipment L1 b L2

and I want to get:

DATE EVENT EQUIPMENT MASK Equipement_Correct
2021/12/08 enterEquipment L1 a L1
2021/12/09 Decouple NA a L1
2021/12/09 stateChange NA a L1
2021/12/10 leaveEquipment L1 a L1
2021/12/10 enterEquipment L2 b L1
2021/12/11 Decouple NA b L2
2021/12/11 stateChange NA b L2
2021/12/11 leaveEquipment L1 b L2

by using a loop on my code I manage to get the desired result but I would like not to use a loop because I work on millions of lines,

for (i in 1:5) {
Equipement_Correct <-ifelse(is.na(as.character(Equipement_Correct [2:nrow(DC)]))&as.character(DC$MASK[2:nrow(DC)])==as.character(DC$MASK[1:nrow(DC)-1])  & (as.character(DC$EVENT[2:nrow(DC)])=='decouple' | as.character(DC$EVENT[2:nrow(DC)])=='stateChange' |,
                                           as.character(Equipement_Correct [1:nrow(DC)-1]),
                                           as.character(Equipement_Correct [2:nrow(DC)]))
Equipement_Correct <-c(NA,Equipement_Correct )
}

Is there a way not to use a loop?

CodePudding user response:

Assuming the EVENT entries in the data base are ordered and complete as shown, you could coerce to factor and back as.integer, which in this case gives EVENT_N with c(1, 2, 3, 4) repeating "forever". Now, you may exploit the cumsum which we calculate on the zero values of modulo %% 4, i.e. the cumulative sum of the resulting TRUE's. We need to subtract - 1 to get the lag right.

lv <- c("enterEquipment", "Decouple", "stateChange", "leaveEquipment")

dat <- within(dat, {
  EVENT_N <- as.integer(factor(EVENT, levels=lv))
  Equipement_Correct <- paste0('L', cumsum(EVENT_N %% 4 - 1 == 0))
  rm(EVENT_N)  ## removes temporary column
})
dat
#         DATE          EVENT EQUIPMENT MASK Equipement_Correct
# 1 2021/12/08 enterEquipment        L1    a                 L1
# 2 2021/12/09       Decouple      <NA>    a                 L1
# 3 2021/12/09    stateChange      <NA>    a                 L1
# 4 2021/12/10 leaveEquipment        L1    a                 L1
# 5 2021/12/10 enterEquipment        L2    b                 L2
# 6 2021/12/11       Decouple      <NA>    b                 L2
# 7 2021/12/11    stateChange      <NA>    b                 L2
# 8 2021/12/11 leaveEquipment        L1    b                 L2

Maybe it's faster to use a Key data frame instead of factor.

K <- data.frame(matrix(c("enterEquipment", "Decouple", "stateChange", 
                         "leaveEquipment", 1:4), ncol=2)) |> 
  type.convert(as.is=TRUE)

within(dat, {
  Equipement_Correct <- 
    paste0('L', cumsum(K[match(EVENT, K[, 1]), 2] %% 4 - 1 == 0))
})

Data:

dat <- structure(list(DATE = c("2021/12/08", "2021/12/09", "2021/12/09", 
"2021/12/10", "2021/12/10", "2021/12/11", "2021/12/11", "2021/12/11"
), EVENT = c("enterEquipment", "Decouple", "stateChange", "leaveEquipment", 
"enterEquipment", "Decouple", "stateChange", "leaveEquipment"
), EQUIPMENT = c("L1", NA, NA, "L1", "L2", NA, NA, "L1"), MASK = c("a", 
"a", "a", "a", "b", "b", "b", "b"), Equipement_Correct = c("L1", 
"L1", NA, "L1", "L1", "L2", NA, "L2")), class = "data.frame", row.names = c(NA, 
-8L))

CodePudding user response:

Use this package 'zoo' where you get function na.locf (last observation carry forward)

install.packages(zoo)
library(zoo)

data$EQUIPMENT <- na.locf(data$EQUIPMENT)

eg: > library(zoo)
> data <- data.frame(New= c("L1",NA,NA,"L2",NA,"L3",NA,NA), old = 1:8)
> data
   New old
1   L1   1
2 <NA>   2
3 <NA>   3
4   L2   4
5 <NA>   5
6   L3   6
7 <NA>   7
8 <NA>   8
> data$New <- na.locf(data$New)
> data
  New old
1  L1   1
2  L1   2
3  L1   3
4  L2   4
5  L2   5
6  L3   6
7  L3   7
8  L3   8

CodePudding user response:

You can also use tidyr::fill for this purpose:

library(tidyr)

df %>%
  fill(Equipement_Correct, .direction = "down")

        DATE          EVENT EQUIPMENT MASK Equipement_Correct
1 2021/12/08 enterEquipment        L1    a                 L1
2 2021/12/09       Decouple      <NA>    a                 L1
3 2021/12/09    stateChange      <NA>    a                 L1
4 2021/12/10 leaveEquipment        L1    a                 L1
5 2021/12/10 enterEquipment        L2    b                 L1
6 2021/12/11       Decouple      <NA>    b                 L2
7 2021/12/11    stateChange      <NA>    b                 L2
8 2021/12/11 leaveEquipment        L1    b                 L2
  •  Tags:  
  • r
  • Related