Return all combinations of two column types that sum to >=0 and return corresponding summary meta-CodePudding

I have data like this:

example_df <- data.frame(
  col1type1 =c(110:106),
  col2type2 = c(-108:-104),
  col3type1 = c(-109:-105), 
  col4type2 =c(110:106),
  col5type1 =c(107:103),
  col6type2 = c(-110:-106),
  col7type1 =c(109:113),
  col8type2 = c(-120:-116),
  col9type1 = c(-105:-101),
  col10type2 =c(105:101),
  col11type1 = c(-125:-121),
  col12type2 = c(-105:-101) 
)

I want to return only combinations where type1 type2>=0 on the same row and return to a new df the combination where it was >=0, the row, and the two numbers: (I know I could use for/foreach to calculate each cell individually and output to a data.frame, but there has to be a more efficient way)

Desired output like this (incomplete):

#for all possible combinations, like the example rows below
example_first <- data.frame(column_combination="col1type1_col2type2", row=1, sum=2,col1number=110,col2number=-108)
example_mid<- data.frame(column_combination="col1type1_col12type2",row=3, sum=5,col1number=108,col2number=-103)
example_last <- data.frame(column_combination="col9type1_col10type2",row=5,sum=0,col1number=-101,col2number=101)

#would want like this for all possible combinations
desired_incomplete_output <- rbind(example_first,example_mid,example_last)

What is an efficient way to calculate this en masse rather than a brutal for/foreach loop? Thanks!

CodePudding user response：

If the desired complete output consists of 79 results, for the given example, you may do something like this.

Explanation of steps-

Through first 2 lines i.e. mutate and split we have split the data into separate row each of its own dataframe, i.e. into a list.
To work with this list I have used purrr::imap_dfr which basically takes a list as input and outputs a data.frame after row binding all results. In each of its sub-step, I have done-
- First deselecting row column
- the pivoting everything
- then separating name column which has all your column names of input data into two separate columns using tidyr::separate
- then creating a cross product of num1 and num2 combinations using purrr::cross2
- then using map_dfr again to convert that cross product into a data frame
- then using separator to separate column names and values. I used a seapartor @ which I assumed that it is nowhere used in column names
- thereafter filtering the rows
- other basic data wrangling/transformation using dplyr verbs

library(tidyverse)

example_df %>% 
  mutate(row = row_number()) %>% 
  split(.$row) %>% 
  imap_dfr(\(.a, .b) .a %>% 
        select(-row) %>% 
        pivot_longer(everything()) %>% 
        separate(name, into = c('col', 'type'), sep = '(?:type)') %>% 
        {cross2(paste(.$col[.$type == '1'], .$value[.$type == '1'], sep = "@"), 
                paste(.$col[.$type == '2'], .$value[.$type == '2'], sep = "@"))} %>% 
        map_dfr(~ set_names(.x, c('x', 'y'))) %>% 
        separate(x, into = c('col1', 'type1'), convert = TRUE, sep = '@') %>% 
        separate(y, into = c('col2', 'type2'), convert = TRUE, sep = "@") %>% 
        filter(type1   type2 >= 0) %>% 
        mutate(col_comb = paste0(col1, 'type1_', col2, "type2"),
               sum= type1   type2) %>% 
        rename(col1number = type1,
               col2number = type2) %>% 
        select(-col1, -col2) %>% 
        mutate(row = .b))
#> # A tibble: 79 × 5
#>    col1number col2number col_comb               sum row  
#>         <int>      <int> <chr>                <int> <chr>
#>  1        110       -108 col1type1_col2type2      2 1    
#>  2        109       -108 col7type1_col2type2      1 1    
#>  3        110        110 col1type1_col4type2    220 1    
#>  4       -109        110 col3type1_col4type2      1 1    
#>  5        107        110 col5type1_col4type2    217 1    
#>  6        109        110 col7type1_col4type2    219 1    
#>  7       -105        110 col9type1_col4type2      5 1    
#>  8        110       -110 col1type1_col6type2      0 1    
#>  9        110        105 col1type1_col10type2   215 1    
#> 10        107        105 col5type1_col10type2   212 1    
#> # … with 69 more rows

In case your columns are named as anum1, anum2, bnum1..., we can modify this a bit (3 steps actually, all marked as comments)

example_df %>% 
  mutate(row = row_number()) %>% 
  split(.$row) %>% 
  imap_dfr(\(.a, .b) .a %>% 
             select(-row) %>% 
             pivot_longer(everything()) %>% 
             separate(name, into = c('col', 'type'), sep = '(?:num)') %>% # change sep
             {cross2(paste(.$col[.$type == '1'], .$value[.$type == '1'], sep = "@"), 
                     paste(.$col[.$type == '2'], .$value[.$type == '2'], sep = "@"))} %>% 
             map_dfr(~ set_names(.x, c('x', 'y'))) %>% 
             separate(x, into = c('col1', 'type1'), convert = TRUE, sep = '@') %>% 
             separate(y, into = c('col2', 'type2'), convert = TRUE, sep = "@") %>% 
             filter(type1   type2 >= 0) %>% 
             mutate(col_comb = paste0(col1, 'type1_', col2, "type2"),
                    sum= type1   type2) %>% 
             rename(col1number = num1,      # change prefix
                    col2number = num2) %>%  # change prefix
             select(-col1, -col2) %>% 
             mutate(row = .b))

CodePudding user response：

You can pivot_longer, excepting each column in iteration. Then mutate to create sum, column names, rows, and filter for non-negatives

library(dplyr)
library(tidyr)
library(purrr)
map_dfr(1:length(example_df), function(i){
example_df %>% 
    # Get row number
    tibble::rownames_to_column(., var = "row") %>% 
    # Excepting the rowname column and iterating through each column (except row)
    pivot_longer(-c(row, (i   1))) %>% 
    # Get col1number value
    mutate(col1number = .[[2]]) %>% 
    rowwise() %>% 
    # Get the column names and paste together for combination
    mutate(column1 = colnames(example_df)[i],
           column_combination = paste(column1, name, sep = "_"),
           # These are the value columns
           sum = sum(across(c(2, 4))),
           # col2 name
           col2number = value) %>% 
    filter(sum >= 0) %>% 
    select(column_combination, row, sum, col1number, col2number)
}) %>% 
  bind_rows %
  ungroup()

Gives:

# A tibble: 288 × 5
   column_combination   row     sum col1number col2number
   <chr>                <chr> <int>      <int>      <int>
 1 col1type1_col2type2  1         2        110       -108
 2 col1type1_col3type1  1         1        110       -109
 3 col1type1_col4type2  1       220        110        110
 4 col1type1_col5type1  1       217        110        107
 5 col1type1_col6type2  1         0        110       -110
 6 col1type1_col7type1  1       219        110        109
 7 col1type1_col9type1  1         5        110       -105
 8 col1type1_col10type2 1       215        110        105
 9 col1type1_col12type2 1         5        110       -105
10 col1type1_col2type2  2         2        109       -107
# … with 278 more rows

CodePudding user response：

A matrix approach:

m <- as.matrix(example_df)

type1 <- seq(1,ncol(m),by=2)
type2 <- seq(2,ncol(m),by=2)

cbn <- expand.grid(type1=type1,type2=type2)

res.selection <- as.vector((m[,cbn$type1] m[,cbn$type2])>0)

res.row <- rep(1:nrow(m),nrow(cbn))
res.type1number <- as.vector(m[,cbn$type1])
res.type2number <- as.vector(m[,cbn$type2])
res.sum <- as.vector(m[,cbn$type1] m[,cbn$type2])
res.type1 <- rep(cbn$type1,each=nrow(m))
res.type2 <- rep(cbn$type2,each=nrow(m))    

data.frame( combination = paste0('col',res.type1[res.selection],'type1-col',res.type2[res.selection],'type2'), 
            row = res.row[res.selection],
            type1number = res.type1number[res.selection], 
            type2number = res.type2number[res.selection],
            sum = res.sum[res.selection])

#>             combination row type1number type2number sum
#> 1   col1type1-col2type2   1         110        -108   2
#> 2   col1type1-col2type2   2         109        -107   2
#> 3   col1type1-col2type2   3         108        -106   2
#> 4   col1type1-col2type2   4         107        -105   2
#> 5   col1type1-col2type2   5         106        -104   2
#> 6   col7type1-col2type2   1         109        -108   1
#> 7   col7type1-col2type2   2         110        -107   3
#> 8   col7type1-col2type2   3         111        -106   5
#> 9   col7type1-col2type2   4         112        -105   7
#> 10  col7type1-col2type2   5         113        -104   9
#> 11  col1type1-col4type2   1         110         110 220
#> 12  col1type1-col4type2   2         109         109 218
#> 13  col1type1-col4type2   3         108         108 216
#> 14  col1type1-col4type2   4         107         107 214
#> 15  col1type1-col4type2   5         106         106 212
#> 16  col3type1-col4type2   1        -109         110   1
#> 17  col3type1-col4type2   2        -108         109   1
#> 18  col3type1-col4type2   3        -107         108   1
#> 19  col3type1-col4type2   4        -106         107   1
#> 20  col3type1-col4type2   5        -105         106   1
#> 21  col5type1-col4type2   1         107         110 217
#> 22  col5type1-col4type2   2         106         109 215
#> 23  col5type1-col4type2   3         105         108 213
#> 24  col5type1-col4type2   4         104         107 211
#> 25  col5type1-col4type2   5         103         106 209
#> 26  col7type1-col4type2   1         109         110 219
#> 27  col7type1-col4type2   2         110         109 219
#> 28  col7type1-col4type2   3         111         108 219
#> 29  col7type1-col4type2   4         112         107 219
#> 30  col7type1-col4type2   5         113         106 219
#> 31  col9type1-col4type2   1        -105         110   5
#> 32  col9type1-col4type2   2        -104         109   5
#> 33  col9type1-col4type2   3        -103         108   5
#> 34  col9type1-col4type2   4        -102         107   5
#> 35  col9type1-col4type2   5        -101         106   5
#> 36  col7type1-col6type2   2         110        -109   1
#> 37  col7type1-col6type2   3         111        -108   3
#> 38  col7type1-col6type2   4         112        -107   5
#> 39  col7type1-col6type2   5         113        -106   7
#> 40 col1type1-col10type2   1         110         105 215
#> 41 col1type1-col10type2   2         109         104 213
#> 42 col1type1-col10type2   3         108         103 211
#> 43 col1type1-col10type2   4         107         102 209
#> 44 col1type1-col10type2   5         106         101 207
#> 45 col5type1-col10type2   1         107         105 212
#> 46 col5type1-col10type2   2         106         104 210
#> 47 col5type1-col10type2   3         105         103 208
#> 48 col5type1-col10type2   4         104         102 206
#> 49 col5type1-col10type2   5         103         101 204
#> 50 col7type1-col10type2   1         109         105 214
#> 51 col7type1-col10type2   2         110         104 214
#> 52 col7type1-col10type2   3         111         103 214
#> 53 col7type1-col10type2   4         112         102 214
#> 54 col7type1-col10type2   5         113         101 214
#> 55 col1type1-col12type2   1         110        -105   5
#> 56 col1type1-col12type2   2         109        -104   5
#> 57 col1type1-col12type2   3         108        -103   5
#> 58 col1type1-col12type2   4         107        -102   5
#> 59 col1type1-col12type2   5         106        -101   5
#> 60 col5type1-col12type2   1         107        -105   2
#> 61 col5type1-col12type2   2         106        -104   2
#> 62 col5type1-col12type2   3         105        -103   2
#> 63 col5type1-col12type2   4         104        -102   2
#> 64 col5type1-col12type2   5         103        -101   2
#> 65 col7type1-col12type2   1         109        -105   4
#> 66 col7type1-col12type2   2         110        -104   6
#> 67 col7type1-col12type2   3         111        -103   8
#> 68 col7type1-col12type2   4         112        -102  10
#> 69 col7type1-col12type2   5         113        -101  12