Home > Back-end >  Separate positive values into multiple rows on multiple columns
Separate positive values into multiple rows on multiple columns

Time:07-15

Suppose I have a data set like this:

dat <- tibble(id = 1:4, 
              col1 = c(0, 1, 1, 0),
              col2 = c(1, 0, 1, 0),
              col3 = c(1, 1, 0, 1))

> dat
# A tibble: 4 × 4
     id  col1  col2  col3
  <int> <dbl> <dbl> <dbl>
1     1     0     1     1
2     2     1     0     1
3     3     1     1     0
4     4     0     0     1

I'd like to separate, for every unique id, the multiple 1s into multiple rows, i.e. the expected output is:

# A tibble: 7 × 4
     id  col1  col2  col3
  <dbl> <dbl> <dbl> <dbl>
1     1     0     1     0
2     1     0     0     1
3     2     1     0     0
4     2     0     0     1
5     3     1     0     0
6     3     0     1     0
7     4     0     0     1

For the first id (id = 1), col2 and col3 are both 1, so I would like a separate row for each of them. It kinda is like one-hot encoding for rows.

CodePudding user response:

With help from Ritchie Sacramento and RobertoT

library(tidyverse)

dat <- tibble(id = 1:4, 
              col1 = c(0, 1, 1, 0),
              col2 = c(1, 0, 1, 0),
              col3 = c(1, 1, 0, 1))

dat %>%  
  pivot_longer(-id) %>% 
  filter(value != 0) %>% 
  mutate(rows = 1:nrow(.)) %>% 
  pivot_wider(values_fill = 0, 
              names_sort = TRUE) %>% 
  select(-rows)

# A tibble: 7 × 4
     id  col1  col2  col3
  <int> <dbl> <dbl> <dbl>
1     1     0     1     0
2     1     0     0     1
3     2     1     0     0
4     2     0     0     1
5     3     1     0     0
6     3     0     1     0
7     4     0     0     1

CodePudding user response:

Here is an alternative approach using model.matrix():

From the documenation: model.matrix creates a design (or model) matrix, e.g., by expanding factors to a set of dummy variables (depending on the contrasts) and expanding interactions similarly.

library(dplyr)
library(tidyr)

dat %>% 
  pivot_longer(-id) %>% 
  filter(value == 1) %>% 
  cbind((model.matrix(~ name   0, .) == 1)*1)
  id name value namecol1 namecol2 namecol3
1  1 col2     1        0        1        0
2  1 col3     1        0        0        1
3  2 col1     1        1        0        0
4  2 col3     1        0        0        1
5  3 col1     1        1        0        0
6  3 col2     1        0        1        0
7  4 col3     1        0        0        1

CodePudding user response:

You could do

arrange(bind_rows(lapply(2:4, function(x) {
  d <- dat[dat[[x]] == 1,]
  d[-c(1, x)] <- 0
  d})), id)
#> # A tibble: 7 x 4
#>      id  col1  col2  col3
#>   <int> <dbl> <dbl> <dbl>
#> 1     1     0     1     0
#> 2     1     0     0     1
#> 3     2     1     0     0
#> 4     2     0     0     1
#> 5     3     1     0     0
#> 6     3     0     1     0
#> 7     4     0     0     1

Created on 2022-07-14 by the reprex package (v2.0.1)

CodePudding user response:

Using explicit loops:

nullrow <- rep(0, ncol(dat)-1)
data <- dat[,-1]
rowsums <- apply(data, 1, sum)
res <- data[0,]
ids <- c()
for(i in 1:nrow(data)) {
  if(rowsums[i]>0) {
    for(j in 1:rowsums[i]) {
      thisrow <- nullrow
      thiscolumn <- which(data[i,]==1)[j]
      thisrow[thiscolumn] <- 1
      res <- rbind(res, thisrow)
    }
    ids <- c(ids, rep(dat$id[i], rowsums[i]))
  }  
}
names(res) <- colnames(data)
res$id <- ids
> res
  col1 col2 col3 id
1    0    1    0  1
2    0    0    1  1
3    1    0    0  2
4    0    0    1  2
5    1    0    0  3
6    0    1    0  3
7    0    0    1  4

CodePudding user response:

A possible solution, based on Matrix::sparseMatrix:

  1. First, it gets the indexes where there are 1 (with which).
  2. Second, it adjusts the row indexes, to force one 1 per row.
  3. Third, it creates a sparse matrix, putting the 1 where the adjusted indexes specify.
library(tidyverse)
library(Matrix)

which(dat[-1] == 1, arr.ind = T) %>% 
  as.data.frame %>% 
  arrange(row) %>% 
  mutate(id = dat[row,"id"], row = 1:n()) %>% 
  {data.frame(id = .$id, as.matrix( sparseMatrix(i = .$row, j= .$col, x= 1)))}

#>   id X1 X2 X3
#> 1  1  0  1  0
#> 2  1  0  0  1
#> 3  2  1  0  0
#> 4  2  0  0  1
#> 5  3  1  0  0
#> 6  3  0  1  0
#> 7  4  0  0  1

Another possible solution:

library(tidyverse)

f <- function(df)
{
  got <- 0
  
  for (i in 1:nrow(df))
  {
    got <- which.max(df[i, (got 1):ncol(df)])   got
    df[i, -got] <- 0
  }
  
  df  
}

dat %>% 
  slice(map(1:nrow(dat), ~ rep(.x, rowSums(dat[-1])[.x])) %>% unlist) %>% 
  group_by(id) %>% 
  group_modify(~ f(.)) %>% 
  ungroup

#> # A tibble: 7 × 4
#>      id  col1  col2  col3
#>   <int> <dbl> <dbl> <dbl>
#> 1     1     0     1     0
#> 2     1     0     0     1
#> 3     2     1     0     0
#> 4     2     0     0     1
#> 5     3     1     0     0
#> 6     3     0     1     0
#> 7     4     0     0     1
  • Related