Imagine the following data frame:
# ID v1 v2 v3 v4
#1 H 0 0 d 0
#2 I 0 0 0 0
#3 J d 0 0 0
#4 K 0 0 0 d
#5 L 0 d 0 0
There is either one or no "d" per row.
For each row, I want to convert everything after the "d" to NA. Desired result:
# ID v1 v2 v3 v4
#1 H 0 0 d NA
#2 I 0 0 0 0
#3 J d NA NA NA
#4 K 0 0 0 d
#5 L 0 d NA NA
DATA
df <- data.frame(ID = LETTERS[8:12],
v1 = c(0, 0, 'd', 0, 0),
v2 = c(0, 0, 0, 0, 'd'),
v3 = c('d', 0, 0, 0, 0),
v4 = c(0, 0, 0, 'd', 0),
stringsAsFactors = FALSE)
CodePudding user response:
Using cummax:
ix = df == "d"
df[t(apply(ix, 1, cummax)) & !ix] = NA
# ID v1 v2 v3 v4
# 1 H 0 0 d <NA>
# 2 I 0 0 0 0
# 3 J d <NA> <NA> <NA>
# 4 K 0 0 0 d
# 5 L 0 d <NA> <NA>
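For intuition (an illustration, not part of the answer itself): the row-wise cumulative maximum flags the "d" cell and everything after it; the & !ix part then excludes the "d" cell itself, so only the trailing cells become NA.
ix = df == "d"
t(apply(ix, 1, cummax))
#   ID v1 v2 v3 v4
# 1  0  0  0  1  1
# 2  0  0  0  0  0
# 3  0  1  1  1  1
# 4  0  0  0  0  1
# 5  0  0  0  0  1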
To increase speed, replace apply with collapse::dapply:
ix = df == "d"
df[collapse::dapply(ix, cummax, MARGIN = 1) & !ix] = NA
Or use matrixStats::rowCummaxs:
ix = df == "d"
df[matrixStats::rowCummaxs(+ix) & !ix] = NA
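If you want to check the speed claims on data shaped like yours, here is a rough benchmark sketch; the scaled-up data frame big and the use of microbenchmark are assumptions for illustration, not part of the answers above.
big = df[rep(seq_len(nrow(df)), 1e4), ]   # hypothetical larger copy of the example data
ix  = big == "d"
microbenchmark::microbenchmark(
  apply       = t(apply(ix, 1, cummax)),
  collapse    = collapse::dapply(ix, cummax, MARGIN = 1),
  matrixStats = matrixStats::rowCummaxs(+ix),
  times = 10
)
Actual timings depend on the shape of the real data, so treat this only as a template.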
CodePudding user response:
Two alternative solutions:
# option 1
w <- which(df == "d", arr.ind = TRUE)       # (row, col) of every "d"
w <- w[w[,2] < ncol(df),]                   # a "d" in the last column has nothing after it
reps <- ncol(df) - w[,2]                    # number of cells to the right of each "d"
w <- w[rep(1:nrow(w), reps),]               # repeat each pair once per cell to blank
w[,2] <- w[,2] + unlist(sapply(reps, seq))  # shift the column index onto those cells
df[w] <- NA
# option 2
mc <- ncol(df) - max.col(df == "d", ties.method = "first")  # cells after the "d" in each row
mc[mc >= (ncol(df) - 1)] <- 0                               # rows without a "d": nothing to blank
rr <- rep(seq_along(mc), mc)
cc <- rep(ncol(df) - mc, mc) + unlist(sapply(mc, seq)[mc > 0])
df[cbind(rr, cc)] <- NA
which both also give the desired result.
CodePudding user response:
My version for solving it is:
f1 <- function(x){
  i1 <- which(x == 'd') + 1
  cond <- length(i1) > 0 && i1 <= length(x)
  if (cond) {x[i1:length(x)] <- NA; x} else {x}
}
df[-1] <- t(apply(df[-1], 1, f1))
which gives,
# ID v1 v2 v3 v4
#1 H 0 0 d <NA>
#2 I 0 0 0 0
#3 J d <NA> <NA> <NA>
#4 K 0 0 0 d
#5 L 0 d <NA> <NA>
CodePudding user response:
Here are two base R one-liners.
1) Reduce. Because this operates on entire columns at a time instead of row by row, it should be particularly fast if there are many rows and not many columns.
replace(df, TRUE, Reduce(function(x, y) ifelse(x == "d", NA, y), df, acc = TRUE))
giving:
ID v1 v2 v3 v4
1 H 0 0 d <NA>
2 I 0 0 0 0
3 J d <NA> <NA> <NA>
4 K 0 0 0 d
5 L 0 d <NA> <NA>
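The carry-forward works because ifelse() returns NA wherever its test is NA, so once a row has been set to NA it stays NA in every later accumulated column. A one-line illustration (made-up vectors, not part of the answer):
ifelse(c("0", "d", NA) == "d", NA, c("x", "y", "z"))
# [1] "x" NA  NA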
2) read.table. This assumes that the only occurrences of d are in cells consisting of a single d (which is the case for the example in the question).
replace(df, df!="d"&is.na(read.table(text=do.call(paste,df), comment="d", fill=NA)), NA)
giving:
ID v1 v2 v3 v4
1 H 0 0 d <NA>
2 I 0 0 0 0
3 J d <NA> <NA> <NA>
4 K 0 0 0 d
5 L 0 d <NA> <NA>
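To see why this works, print the inner call on its own: the comment character "d" makes read.table() stop reading each line at the "d", and fill pads the short lines with NA, so a cell is NA exactly when it sits at or after a "d" (illustration using the answer's own call):
read.table(text = do.call(paste, df), comment = "d", fill = NA)
#   V1 V2 V3 V4 V5
# 1  H  0  0 NA NA
# 2  I  0  0  0  0
# 3  J NA NA NA NA
# 4  K  0  0  0 NA
# 5  L  0 NA NA NA
The df != "d" part of the mask then excludes the "d" cells themselves, so only the cells strictly after a "d" get replaced.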
CodePudding user response:
An alternative with data.table:
library(data.table)
setDT(df)
df[, names(df)[-1] := {
  x <- unlist(.SD)
  if (any(x == "d")) {          # if there's no "d", no need to do anything
    whd <- which(x == "d")
    if (whd != length(x)) {     # if "d" is at the end, nothing to be done either
      x[(whd + 1):length(x)] <- NA
    }
  }
  as.list(x)                    # return the row as a list so the structure is kept
}, .SDcols = -1, by = 1:nrow(df)]  # a "by row" operation is needed here
CodePudding user response:
Another version using col and max.col:
df[-1][col(df[-1]) > max.col(df[-1] == "d", "last")] <- NA
df
# ID v1 v2 v3 v4
#1 H 0 0 d <NA>
#2 I 0 0 0 0
#3 J d <NA> <NA> <NA>
#4 K 0 0 0 d
#5 L 0 d <NA> <NA>
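Rows without a "d" are safe here because, with ties.method = "last", max.col() returns the last column for an all-FALSE row, and col() can never exceed that. Illustration (not part of the answer):
max.col(df[-1] == "d", "last")
# [1] 3 4 1 4 2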
CodePudding user response:
One option utilizing dplyr and purrr could be:
df %>%
mutate(pmap_dfr(across(-ID), ~ `[<-`(c(...), seq_along(c(...)) > which(c(...) == "d"), NA)))
ID v1 v2 v3 v4
1 H 0 0 d <NA>
2 I 0 0 0 0
3 J d <NA> <NA> <NA>
4 K 0 0 0 d
5 L 0 d <NA> <NA>
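The `[<-` call is just the functional form of x[...] <- NA, applied row by row; spelled out for a single made-up row (illustration only):
x <- c("0", "0", "d", "0")
`[<-`(x, seq_along(x) > which(x == "d"), NA)
# [1] "0" "0" "d" NA
For a row without a "d", which() returns integer(0), the comparison gives logical(0), and the row comes back unchanged.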
CodePudding user response:
Another dplyr solution (pivot_longer/pivot_wider come from tidyr):
df %>%
pivot_longer(cols = starts_with("v")) %>%
group_by(ID) %>%
mutate(i = as.logical(cumsum((lag(value == "d", default = FALSE))))) %>%
mutate(value = replace(value, i, NA)) %>%
select(-i) %>%
pivot_wider(id_cols = ID)
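The lag/cumsum pair is what marks, within each ID, every position after the first "d"; on a single made-up row (illustration only):
x <- c("0", "d", "0", "0")
as.logical(cumsum(dplyr::lag(x == "d", default = FALSE)))
# [1] FALSE FALSE  TRUE  TRUE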
CodePudding user response:
I think your version of solving this is quite concise. I tried to come up with something from the tidyverse. It's a nested across call which uses cur_column inside str_replace, so despite being "tidy" I think your base version is more readable.
library(dplyr)
library(stringr)
df %>%
mutate(across(v2:v4,
~ ifelse(
rowSums(across("v1":str_replace(cur_column(), "\\d", function(y) as.numeric(y)-1)) == "d") > 0,
NA,
.x)
)
)
#> ID v1 v2 v3 v4
#> 1 H 0 0 d <NA>
#> 2 I 0 0 0 0
#> 3 J d <NA> <NA> <NA>
#> 4 K 0 0 0 d
#> 5 L 0 d <NA> <NA>
Created on 2021-09-20 by the reprex package (v2.0.1)
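The inner str_replace() rewrites the current column name into the one before it, so the rowSums(across(...)) check covers every column up to, but not including, the current one. Reusing the answer's own call on a single name (illustration only):
stringr::str_replace("v3", "\\d", function(y) as.numeric(y) - 1)
# [1] "v2"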
CodePudding user response:
base
df <- data.frame(ID = LETTERS[8:12],
v1 = c(0, 0, 'd', 0, 0),
v2 = c(0, 0, 0, 0, 'd'),
v3 = c('d', 0, 0, 0, 0),
v4 = c(0, 0, 0, 'd', 0),
stringsAsFactors = FALSE)
df[df == "d"] <- 1
l <- max.col(df[-1], ties.method = "first")
tmp <- t(sapply(l, function(x) seq(ncol(df[-1])) > x))
df[-1][tmp] <- NA
df
#> ID v1 v2 v3 v4
#> 1 H 0 0 1 <NA>
#> 2 I 0 <NA> <NA> <NA>
#> 3 J 1 <NA> <NA> <NA>
#> 4 K 0 0 0 1
#> 5 L 0 1 <NA> <NA>
Created on 2021-09-20 by the reprex package (v2.0.1)
CodePudding user response:
An approach using lagged boolean rowCumsums.
m <- as.matrix(df)
s <- which(as.logical(matrixStats::rowCumsums(m == 'd')))
m[,-1][`length<-`(s, length(s) - nrow(m) + 1)] <- NA
as.data.frame(array(m, dim(df)))
# V1 V2 V3 V4 V5
# 1 H 0 0 d <NA>
# 2 I 0 0 0 0
# 3 J d <NA> <NA> <NA>
# 4 K 0 0 0 d
# 5 L 0 d <NA> <NA>
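Why indexing m[, -1] with positions computed on the full matrix m shifts everything one column to the right (the "lag"): both matrices have the same number of rows, so the same column-major linear index lands one column further once the ID column is dropped, and the length<- truncation removes the trailing indices that would fall outside the smaller matrix. A small illustration (the index 16 is specific to this 5-row example):
m <- as.matrix(df)
m[16]        # row 1, column v3 of the 5 x 5 matrix: "d"
m[, -1][16]  # the same linear index in the 5 x 4 matrix: row 1, column v4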
CodePudding user response:
Another way, using | in Reduce, assuming that the first column is the ID and does not contain a d.
is.na(df[-1]) <- do.call(cbind, Reduce(`|`, lapply(df[-ncol(df)], `==`, "d"),
                                       accumulate = TRUE))
df
# ID v1 v2 v3 v4
#1 H 0 0 d <NA>
#2 I 0 0 0 0
#3 J d <NA> <NA> <NA>
#4 K 0 0 0 d
#5 L 0 d <NA> <NA>
Or using which and sequence, getting arr.ind from which,
i <- which(df[c(-1, -ncol(df))] == "d", arr.ind = TRUE)  # "d" positions, ignoring ID and the last column
n <- ncol(df) - i[,2] - 1                                # cells to the right of each "d"
is.na(df) <- cbind(rep(i[,1], n), sequence(n, 2 + i[,2]))
or using arrayInd.
i <- which(df[c(-1, -ncol(df))] == "d")      # linear indices within the v1:v3 block
n <- ncol(df) - (i-1) %/% nrow(df) - 2       # cells to the right of each "d"
is.na(df) <- arrayInd(sequence(n, i + 2*nrow(df), nrow(df)), dim(df))  # shift into df's index space, stepping one column at a time