Home > Software design >  Convert to NA after a specific value by row
Convert to NA after a specific value by row

Time:09-21

Imagine the following data frame:

#  ID v1 v2 v3 v4
#1  H  0  0  d  0
#2  I  0  0  0  0
#3  J  d  0  0  0
#4  K  0  0  0  d
#5  L  0  d  0  0

There is either one or no "d" per row.

For each row, I want to convert everything after d to NA. Desired result:

#  ID v1  v2  v3  v4
#1  H  0   0   d  NA
#2  I  0   0   0   0
#3  J  d  NA  NA  NA
#4  K  0   0   0   d
#5  L  0   d  NA  NA

DATA

df <- data.frame(ID = LETTERS[8:12], 
                 v1 = c(0, 0, 'd', 0, 0), 
                 v2 = c(0, 0, 0, 0, 'd'), 
                 v3 = c('d', 0, 0, 0, 0), 
                 v4 = c(0, 0, 0, 'd', 0), 
      stringsAsFactors = FALSE)

CodePudding user response:

Using cummax:

ix = df == "d"
df[t(apply(ix, 1, cummax)) & !ix] = NA
#   ID v1   v2   v3   v4
# 1  H  0    0    d <NA>
# 2  I  0    0    0    0
# 3  J  d <NA> <NA> <NA>
# 4  K  0    0    0    d
# 5  L  0    d <NA> <NA>

To increase speed, replace apply with collapse::dapply:

ix = df == "d"
df[collapse::dapply(ix, cummax, MARGIN = 1) & !ix] = NA

Or use matrixStats::rowCummaxs:

ix = df == "d"
df[rowCummaxs( ix) & !ix] = NA

CodePudding user response:

Two alternative solutions:

# option 1
w <- which(df == "d", arr.ind = TRUE)
w <- w[w[,2] < ncol(df),]
reps <- ncol(df) - w[,2]
w <- w[rep(1:nrow(w), reps),]
w[,2] <- w[,2]   unlist(sapply(reps, seq))

df[w] <- NA

# option 2
mc <- ncol(df) - max.col(df == "d", ties.method = "first")
mc[mc >= (ncol(df) - 1)] <- 0
rr <- rep(seq_along(mc), mc)
cc <- rep(ncol(df) - mc, mc)   unlist(sapply(mc, seq)[mc > 0])

df[cbind(rr, cc)] <- NA

which both also give the desired result.

CodePudding user response:

My version for solving it is:

f1 <- function(x){
  i1 <- which(x == 'd')   1
  cond <- length(i1) > 0 && i1 <= length(x)
  if (cond){x[i1:(length(x))] <- NA;x}else{x}
}
df[-1] <- t(apply(df[-1], 1, f1))

which gives,

#  ID v1   v2   v3   v4
#1  H  0    0    d <NA>
#2  I  0    0    0    0
#3  J  d <NA> <NA> <NA>
#4  K  0    0    0    d
#5  L  0    d <NA> <NA>

CodePudding user response:

Here are two base R one-liners.

1) Reduce Because this operates on entire columns at a time instead of row by row it should be particularly fast if there are many rows and not many columns.

replace(df, TRUE, Reduce(function(x, y) ifelse(x == "d", NA, y), df, acc = TRUE))

giving:

  ID v1   v2   v3   v4
1  H  0    0    d <NA>
2  I  0    0    0    0
3  J  d <NA> <NA> <NA>
4  K  0    0    0    d
5  L  0    d <NA> <NA>

2) read.table This assumes that the only occurrences of d are in cells consisting of a single d (which is the case for the example in the question).

replace(df, df!="d"&is.na(read.table(text=do.call(paste,df), comment="d", fill=NA)), NA)

giving:

  ID v1   v2   v3   v4
1  H  0    0    d <NA>
2  I  0    0    0    0
3  J  d <NA> <NA> <NA>
4  K  0    0    0    d
5  L  0    d <NA> <NA>

CodePudding user response:

Some alternative with :

library(data.table)
setDT(df)

df[, names(df)[-1] := {x <- unlist(.SD)
                      if(any(x=="d")) { # if there's no "d", no need to do anything
                            whd <- which(x=="d")
                            if(whd != length(x)) { # if "d" is at the end, nothing to be done either
                                x[(whd 1):length(x)] <- NA
                            }
                       }
                       as.list(x)}, # return the line as a list so the structure is kept
       .SDcols=-1, by=1:nrow(df)] # you need to do a "by row" operation

CodePudding user response:

Another version using col and max.col:

df[-1][col(df[-1]) > max.col(df[-1] == "d", "last")] <- NA
df

#  ID v1   v2   v3   v4
#1  H  0    0    d <NA>
#2  I  0    0    0    0
#3  J  d <NA> <NA> <NA>
#4  K  0    0    0    d
#5  L  0    d <NA> <NA>

CodePudding user response:

One option utilizing dplyr and purrr could be:

df %>%
    mutate(pmap_dfr(across(-ID), ~ `[<-`(c(...), seq_along(c(...)) > which(c(...) == "d"), NA)))

  ID v1   v2   v3   v4
1  H  0    0    d <NA>
2  I  0    0    0    0
3  J  d <NA> <NA> <NA>
4  K  0    0    0    d
5  L  0    d <NA> <NA>

CodePudding user response:

Another dplyr solution

df %>% 
  pivot_longer(cols = starts_with("v")) %>%
  group_by(ID) %>%
  mutate(i = as.logical(cumsum((lag(value == "d", default = FALSE))))) %>%
  mutate(value = replace(value, i, NA)) %>%
  select(-i) %>%
  pivot_wider(id_cols = ID)

CodePudding user response:

I think your version of solving this is quite concise. I tried to come up with something from the tidyverse. Its a nested across call which uses cur_column inside str_replace so despite being "tidy" I think your base version is more readable.

library(dplyr)
library(stringr)

df %>% 
  mutate(across(v2:v4, 
                ~ ifelse(
                  rowSums(across("v1":str_replace(cur_column(), "\\d", function(y) as.numeric(y)-1)) == "d") > 0,
                  NA,
                  .x)
                )
         )
#>   ID v1   v2   v3   v4
#> 1  H  0    0    d <NA>
#> 2  I  0    0    0    0
#> 3  J  d <NA> <NA> <NA>
#> 4  K  0    0    0    d
#> 5  L  0    d <NA> <NA>

Created on 2021-09-20 by the reprex package (v2.0.1)

CodePudding user response:

base

df <- data.frame(ID = LETTERS[8:12], 
                 v1 = c(0, 0, 'd', 0, 0), 
                 v2 = c(0, 0, 0, 0, 'd'), 
                 v3 = c('d', 0, 0, 0, 0), 
                 v4 = c(0, 0, 0, 'd', 0), 
                 stringsAsFactors = FALSE)

df[df == "d"] <- 1
l <- max.col(df[-1], ties.method = "first")
tmp <- t(sapply(l, function(x) seq(ncol(df[-1])) > x))
df[-1][tmp] <- NA
df
#>   ID v1   v2   v3   v4
#> 1  H  0    0    1 <NA>
#> 2  I  0 <NA> <NA> <NA>
#> 3  J  1 <NA> <NA> <NA>
#> 4  K  0    0    0    1
#> 5  L  0    1 <NA> <NA>

Created on 2021-09-20 by the reprex package (v2.0.1)

CodePudding user response:

An approach using lagged boolean rowCumsums.

m <- as.matrix(df)
s <- which(as.logical(matrixStats::rowCumsums(m == 'd')))
m[,-1][`length<-`(s, length(s) - nrow(m)   1)] <- NA
as.data.frame(array(m, dim(df)))
#   V1 V2   V3   V4   V5
# 1  H  0    0    d <NA>
# 2  I  0    0    0    0
# 3  J  d <NA> <NA> <NA>
# 4  K  0    0    0    d
# 5  L  0    d <NA> <NA>

CodePudding user response:

Another way using | in Reduce assuming that the first column is the ID and does not have a d.

is.na(df[-1]) <- do.call(cbind, Reduce(`|`, lapply(df[-nrow(df)], `==`, "d")
                                     , accumulate = TRUE))

df
#  ID v1   v2   v3   v4
#1  H  0    0    d <NA>
#2  I  0    0    0    0
#3  J  d <NA> <NA> <NA>
#4  K  0    0    0    d
#5  L  0    d <NA> <NA>

Or using which and sequence. With getting arr.ind from which,

i <- which(df[c(-1, -ncol(df))] == "d", arr.ind = TRUE)
n <- ncol(df) - i[,2] - 1
is.na(df) <- cbind(rep(i[,1], n), sequence(n, 2   i[,2]))

or using arrayInd.

i <- which(df[c(-1, -ncol(df))] == "d")
n <- ncol(df) - (i-1) %/% nrow(df) - 2
is.na(df) <- arrayInd(sequence(n, i   2*nrow(df), nrow(df)), dim(df))
  • Related