Home > front end >  R - apply different transformations to dataframe based on elements of list
R - apply different transformations to dataframe based on elements of list

Time:09-19

In the following data frame...

Case <- c("Case1", "Case1", "Case1", "Case2", "Case2", "Case2", "Case3", "Case3", "Case3", "Case4", "Case4", "Case4", "Case5", "Case5", "Case5")
From <- c(1920, 1946, 1951, 1912, 1962, 1973, 1933, 1960, 1986, 1940, 1946, 1952, 1926, 1938, 1964)
To <- c(1945, 1950, 1951, 1961, 1972, 1973, 1959, 1985, 1986, 1945, 1951, 1952, 1937, 1963, 1970)
A <- c(0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1)
B <- c(0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0)
C <- c(0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1)
D <- c(1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0)
Outcome <- c(0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0)
Data <- data.frame(Case, From, To, A, B, C, D, Outcome)

... I create a new column (FLC) with a series of transformations based on specified elements of a vector (conditions):

conditions <- c("A", "B", "C", "D")

all_one_by_row <- function(data, cols) {
  if(missing(cols)) as.integer(rowSums(data) == ncol(data))
  else as.integer(rowSums(data[cols]) == ncol(data[cols]))
}
new.df <- Data

tmp <- sapply(new.df[conditions], function(x) x == Lag(x, -1)) 
tmp[is.na(tmp)] <- FALSE
new.df$FLC <- 1 - all_one_by_row(tmp) 
new.df$FLC[new.df$Outcome > 0.5] <- 1
new.df$FLC[new.df$Case != Lag(new.df$Case, -1)] <- 1

new.df

I obtain:

> new.df
    Case From   To A B C D Outcome FLC
1  Case1 1920 1945 0 0 0 1       0   1
2  Case1 1946 1950 1 1 0 0       0   0
3  Case1 1951 1951 1 1 0 0       1   1
4  Case2 1912 1961 0 1 1 1       0   1
5  Case2 1962 1972 1 0 1 0       0   0
6  Case2 1973 1973 1 0 1 0       1   1
7  Case3 1933 1959 0 0 1 1       0   1
8  Case3 1960 1985 0 1 1 0       0   1
9  Case3 1986 1986 1 1 1 0       1   1
10 Case4 1940 1945 1 0 1 0       0   1
11 Case4 1946 1951 1 0 1 1       0   0
12 Case4 1952 1952 1 0 1 1       1   1
13 Case5 1926 1937 0 0 0 0       0   1
14 Case5 1938 1963 1 0 1 1       0   1
15 Case5 1964 1970 1 0 1 0       0   1

Now, I would like to carry out the same operation based on all elements of a list (combinations.list):

combinations <- unlist(lapply(seq_along(conditions), \(i) if(i > 1) combn(conditions, i, FUN = paste, collapse = "") else conditions))

combinations.list  <- strsplit(combinations, "")
combinations.list

> combinations.list
[[1]]
[1] "A"

[[2]]
[1] "B"

[[3]]
[1] "C"

[[4]]
[1] "D"

[[5]]
[1] "A" "B"

[[6]]
[1] "A" "C"

[[7]]
[1] "A" "D"

[[8]]
[1] "B" "C"

[[9]]
[1] "B" "D"

[[10]]
[1] "C" "D"

[[11]]
[1] "A" "B" "C"

[[12]]
[1] "A" "B" "D"

[[13]]
[1] "A" "C" "D"

[[14]]
[1] "B" "C" "D"

[[15]]
[1] "A" "B" "C" "D"

I.e. I would like to obtain a data frame, where a new column (FLC.x) is created for every line of this list based on all elements in the line, such that:

  CASES FROM   TO A B C D OUTCOME FLC.A FLC.B FLC.C
1  Case1 1920 1945 0 0 0 1       0     1     1     0
2  Case1 1946 1950 1 1 0 0       0     0     0     0
3  Case1 1951 1951 1 1 0 0       1     1     1     1
4  Case2 1912 1961 0 1 1 1       0     1     1     0
5  Case2 1962 1972 1 0 1 0       0     0     0     0
6  Case2 1973 1973 1 0 1 0       1     1     1     1
7  Case3 1933 1959 0 0 1 1       0     0     1     0
8  Case3 1960 1985 0 1 1 0       0     1     0     0
9  Case3 1986 1986 1 1 1 0       1     1     1     1
10 Case4 1940 1945 1 0 1 0       0     0     0     0
11 Case4 1946 1951 1 0 1 1       0     0     0     0
12 Case4 1952 1952 1 0 1 1       1     1     1     1
13 Case5 1926 1937 0 0 0 0       0     1     0     1
14 Case5 1938 1963 1 0 1 1       0     0     0     0
15 Case5 1964 1970 1 0 1 0       0     1     1     1
   FLC.D FLC.AB FLC.AC FLC.AD FLC.BC FLC.BD FLC.CD
1      1      1      1      1      1      1      1
2      0      0      0      0      0      0      0
3      1      1      1      1      1      1      1
4      1      1      1      1      1      1      1
5      0      0      0      0      0      0      0
6      1      1      1      1      1      1      1
7      1      1      0      1      1      1      1
8      0      1      1      1      0      0      0
9      1      1      1      1      1      1      1
10     1      0      0      1      0      1      1
11     0      0      0      0      0      0      0
12     1      1      1      1      1      1      1
13     1      1      1      1      1      1      1
14     1      0      0      1      0      1      1
15     1      1      1      1      1      1      1
   FLC.ABC FLC.ABD FLC.ACD FLC.BCD FLC.ABCD
1        1       1       1       1        1
2        0       0       0       0        0
3        1       1       1       1        1
4        1       1       1       1        1
5        0       0       0       0        0
6        1       1       1       1        1
7        1       1       1       1        1
8        1       1       1       0        1
9        1       1       1       1        1
10       0       1       1       1        1
11       0       0       0       0        0
12       1       1       1       1        1
13       1       1       1       1        1
14       0       1       1       1        1
15       1       1       1       1        1
> 

Could someone help me with a way to do this?

CodePudding user response:

Thanks to r2evans indications, I managed to solve it like this:

conditions <- c("A", "B", "C", "D")

combinations <- unlist(lapply(seq_along(conditions), \(i) if(i > 1) combn(conditions, i, FUN = paste, collapse = "") else conditions))
combinations

flcs <- paste("FLC.", combinations, sep="") 
flcs

combinations.list  <- strsplit(combinations, "")
combinations.list

all_one_by_row <- function(data, cols) {
  if(missing(cols)) as.integer(rowSums(data) == ncol(data))
  else as.integer(rowSums(data[cols]) == ncol(data[cols]))
}
new.df <- Data

new.df <- lapply(combinations.list, function(conditions){
  tmp <- sapply(new.df[conditions], function(x) x == Lag(x, -1)) 
  tmp[is.na(tmp)] <- FALSE
  new.df$FLC <- 1 - all_one_by_row(tmp) 
  new.df$FLC[new.df$Outcome > 0.5] <- 1
  new.df$FLC[new.df$Case != Lag(new.df$Case, -1)] <- 1
  new.df
})

new.df <- lapply(new.df, function(x) x[(names(x) %in% c("FLC"))])
new.df <- bind_cols(new.df)
colnames(new.df) <- flcs 
new.df <- cbind(Data, new.df)
new.df
  • Related