I have one column (a) with only two types of values: NULL and TRUE. I want to create 'a_deduplicated' column out of 'a', which is taking the first TRUE value and make the rest NULL. I do not have any other columns to group by it.
Could you please help me to write a code in R to get the 'a_deduplicated' column?
a | a_deduplicated |
---|---|
NA | NA |
NA | NA |
NA | NA |
TRUE | TRUE |
TRUE | NA |
NA | NA |
TRUE | TRUE |
TRUE | NA |
TRUE | NA |
NA | NA |
NA | NA |
TRUE | TRUE |
TRUE | NA |
TRUE | NA |
TRUE | NA |
df to run in R:
df <- structure(list(a = c(NA, NA, NA, TRUE, TRUE, NA, TRUE, TRUE, TRUE, NA, NA, TRUE, TRUE, TRUE, TRUE),
a_deduplicated = c(NA, NA, NA, TRUE, NA, NA, TRUE, NA, NA, NA, NA, TRUE, NA, NA, NA)), class = c("tbl_df","tbl", "data.frame"),
row.names = c(NA, -15L))
CodePudding user response:
Similar to Mael's answer, but using the lag function
require(tidyverse)
df <- df %>%
mutate(
new = ifelse(a & is.na(lag(a)), TRUE, NA)
)
df
#> # A tibble: 15 × 3
#> a a_deduplicated new
#> <lgl> <lgl> <lgl>
#> 1 NA NA NA
#> 2 NA NA NA
#> 3 NA NA NA
#> 4 TRUE TRUE TRUE
#> 5 TRUE NA NA
#> 6 NA NA NA
#> 7 TRUE TRUE TRUE
#> 8 TRUE NA NA
#> 9 TRUE NA NA
#> 10 NA NA NA
#> 11 NA NA NA
#> 12 TRUE TRUE TRUE
#> 13 TRUE NA NA
#> 14 TRUE NA NA
#> 15 TRUE NA NA
require(waldo)
#> Lade nötiges Paket: waldo
waldo::compare(df$a_deduplicated, df$new)
#> ✔ No differences
CodePudding user response:
You can do:
transform(df, a_new = ifelse(a & is.na(diff(c(0, a))), T, NA))
a a_deduplicated a_new
1 NA NA NA
2 NA NA NA
3 NA NA NA
4 TRUE TRUE TRUE
5 TRUE NA NA
6 NA NA NA
7 TRUE TRUE TRUE
8 TRUE NA NA
9 TRUE NA NA
10 NA NA NA
11 NA NA NA
12 TRUE TRUE TRUE
13 TRUE NA NA
14 TRUE NA NA
15 TRUE NA NA
CodePudding user response:
df <- structure(list(a = c(NA, NA, NA, TRUE, TRUE, NA, TRUE, TRUE, TRUE, NA, NA, TRUE, TRUE, TRUE, TRUE),
a_deduplicated = c(NA, NA, NA, TRUE, NA, NA, TRUE, NA, NA, NA, NA, TRUE, NA, NA, NA)), class = c("tbl_df","tbl", "data.frame"),
row.names = c(NA, -15L))
library(tidyverse)
df %>%
group_by(grp = data.table::rleid(a)) %>%
mutate(res = if_else(row_number() == 1, a, NA), .keep = c("unused")) %>%
select(-grp) %>%
ungroup()
#> # A tibble: 15 x 3
#> grp a_deduplicated res
#> <int> <lgl> <lgl>
#> 1 1 NA NA
#> 2 1 NA NA
#> 3 1 NA NA
#> 4 2 TRUE TRUE
#> 5 2 NA NA
#> 6 3 NA NA
#> 7 4 TRUE TRUE
#> 8 4 NA NA
#> 9 4 NA NA
#> 10 5 NA NA
#> 11 5 NA NA
#> 12 6 TRUE TRUE
#> 13 6 NA NA
#> 14 6 NA NA
#> 15 6 NA NA
library(data.table)
setDT(df)[, res := ifelse(seq(.N) == 1, a, NA), by = rleid(a)][]
#> a a_deduplicated res
#> 1: NA NA NA
#> 2: NA NA NA
#> 3: NA NA NA
#> 4: TRUE TRUE TRUE
#> 5: TRUE NA NA
#> 6: NA NA NA
#> 7: TRUE TRUE TRUE
#> 8: TRUE NA NA
#> 9: TRUE NA NA
#> 10: NA NA NA
#> 11: NA NA NA
#> 12: TRUE TRUE TRUE
#> 13: TRUE NA NA
#> 14: TRUE NA NA
#> 15: TRUE NA NA
Created on 2022-08-15 by the reprex package (v2.0.1)
CodePudding user response:
Solution using data.table
:
setDT(df)
df[, flag:=shift(a,1,type="lag")]
df[, dedup:=ifelse(is.na(flag) & a == TRUE,a,NA)]
df
a a_deduplicated flag dedup
1: NA NA NA NA
2: NA NA NA NA
3: NA NA NA NA
4: TRUE TRUE NA TRUE
5: TRUE NA TRUE NA
6: NA NA TRUE NA
7: TRUE TRUE NA TRUE
8: TRUE NA TRUE NA
9: TRUE NA TRUE NA
10: NA NA TRUE NA
11: NA NA NA NA
12: TRUE TRUE NA TRUE
13: TRUE NA TRUE NA
14: TRUE NA TRUE NA
15: TRUE NA TRUE NA