looping logistic regression row wise in R-CodePudding

I'm trying to run logistic regression analyses for each row from R1 to R6 against C1 row of dataframe "dat"

Input data frame

dat <- data.frame(list(X1 = c(1, 1, 0, 1, 1, 1, 0), X2 = c(1, 1, 0, 1, 1, 1, 0), X3 = c(1, 1, 1, 0, 1, 1, 0), X4 = c(1, 1, 0, 1, 1, 1, 0), X5 = c(0, 1, 0, 0, 1, 1, 0), X6 = c(1, 1, 1, 0, 1, 1, 0), X7 = c(0, 1, 0, 1, 1, 1, 0), X8 = c(0, 1, 1, 0, 1, 0, 0), X9 = c(0, 1, 1, 0, 1, 0, 0), X10 = c(1, 1, 1, 0, 1, 0, 0), X11 = c(1, 1, 1, 1, 1, 0, 0), X12 = c(0, 1, 0, 0, 1, 0, 0 ), X13 = c(0, 1, 0, 1, 1, 0, 0), X14 = c(1, 1, 0, 1, 1, 0, 0), X15 = c(1, 1, 1, 1, 1, 0, 0), X16 = c(0, 1, 1, 1, 1, 0, 0), X17 = c(0, 1, 1, 0, 1, 0, 0), X18 = c(0, 1, 1, 0, 1, 0, 0), X19 = c(1, 1, 0, 0, 1, 0, 0), X20 = c(1, 1, 1, 0, 1, 0, 0), X21 = c(0, 1, 0, 1, 1, 0, 0), X22 = c(1, 1, 0, 0, 1, 0, 0), X23 = c(1, 1, 1, 0, 1, 0, 0), X24 = c(1, 1, 0, 0, 1, 0, 0), X25 = c(1, 1, 0, 0, 1, 0, 0), X26 = c(0, 1, 1, 1, 1, 0, 0), X27 = c(0, 1, 0, 0, 1, 0, 0), X28 = c(1, 1, 0, 0, 1, 0, 0), X29 = c(1, 1, 1, 1, 0, 0, 0), X30 = c(1, 1, 0, 0, 0, 1, 0)),  row.names = c("r1", "r2", "r3", "r4", "r5", "r6", "C1"))

logistic regression analyses

    R1 <- glm(r1 ~ C1, data=dat, family=binomial); coef(summary(R1))[,2] 
    R2 <- glm(r2 ~ C1, data=dat, family=binomial); coef(summary(R2))[,2] 
    R3 <- glm(r3 ~ C1, data=dat, family=binomial); coef(summary(R3))[,2] 
    R4 <- glm(r4 ~ C1, data=dat, family=binomial); coef(summary(R4))[,2] 
    R5 <- glm(r5 ~ C1, data=dat, family=binomial); coef(summary(R5))[,2] 
    R6 <- glm(r6 ~ C1, data=dat, family=binomial); coef(summary(R6))[,2]

The real data have 6000 row so it is not possible to do one by one row against C1.

Is there is a way to do it in loop where glm is calculated for each row from R1 to R6 against C1 and extract the output into a new column?

CodePudding user response：

I think this is what you are looking for.

library(tidyverse)

dat %>%
  rownames_to_column("var") %>%
  pivot_longer(cols = -var) %>%
  mutate(grp = ifelse(grepl("r", var), "x", "y")) %>%
  group_split(grp) %>%
  reduce(full_join, by = "name") %>%
  nest(data = -var.x) %>%
  mutate(reg = map_dbl(data, ~glm(value.x ~ value.y, data=.x, family=binomial) %>%
                     summary() %>%
                     coef() %>%
                     .[,2])) %>%
  select(-data)
#> # A tibble: 6 x 2
#>   var.x       reg
#>   <chr>     <dbl>
#> 1 r1        0.373
#> 2 r2    39436.   
#> 3 r3        0.366
#> 4 r4        0.373
#> 5 r5        0.732
#> 6 r6        0.413

First we make your row names into a variable name with rownames_to_column, then we take the dataframe from wide to long using pivot_longer. Then we split the dataframe based on your "x" variable and your "y" variable for your logistic regression. I then join the split dataframe with itself to get every combination of x and y. Lastly, I nest the dataframe and run all the regressions for each "row".

Let me know if anything is unclear.

CodePudding user response：

library(tidyverse)

independent_var <- 'C1'
dependent_vars <- setdiff(rownames(dat),independent_var)
compare <- NULL

for(i in dependent_vars){
    modelname <- toupper(i)
    
    filtered_data <- dat %>%
    t %>% 
    data.frame %>% 
    select(all_of(c(i,'C1')))
    
    eval(parse(text=sprintf('%s <- glm(%s ~ %s,data=filtered_data,family=binomial)',modelname,i,independent_var)))
    eval(parse(text=sprintf('newrow <- data.frame(model="%s",coef=as.numeric(%s$coefficients[-2]))',modelname,modelname)))
    compare <- rbind(compare,newrow)
}

compare

output;

  model   coef
  <chr>  <dbl>
1 R1     0.405
2 R2    25.6  
3 R3    -0.134
4 R4    -0.405
5 R5     2.64 
6 R6    -1.01

CodePudding user response：

This can be done for any number of lines. First, however, a small note. In your data, row C1 contained all the values 0. This, of course, was not suitable for the glm function. So I had to edit the C1 line a bit.

library(tidyverse)

dat <- data.frame(list(X1 = c(1, 1, 0, 1, 1, 1, 1), X2 = c(1, 1, 0, 1, 1, 1, 1), X3 = c(1, 1, 1, 0, 1, 1, 1), X4 = c(1, 1, 0, 1, 1, 1, 0), X5 = c(0, 1, 0, 0, 1, 1, 0), X6 = c(1, 1, 1, 0, 1, 1, 0), X7 = c(0, 1, 0, 1, 1, 1, 0), X8 = c(0, 1, 1, 0, 1, 0, 0), X9 = c(0, 1, 1, 0, 1, 0, 0), X10 = c(1, 1, 1, 0, 1, 0, 0), X11 = c(1, 1, 1, 1, 1, 0, 0), X12 = c(0, 1, 0, 0, 1, 0, 0 ), X13 = c(0, 1, 0, 1, 1, 0, 1), X14 = c(1, 1, 0, 1, 1, 0, 1), X15 = c(1, 1, 1, 1, 1, 0, 0), X16 = c(0, 1, 1, 1, 1, 0, 0), X17 = c(0, 1, 1, 0, 1, 0, 0), X18 = c(0, 1, 1, 0, 1, 0, 0), X19 = c(1, 1, 0, 0, 1, 0, 0), X20 = c(1, 1, 1, 0, 1, 0, 0), X21 = c(0, 1, 0, 1, 1, 0, 0), X22 = c(1, 1, 0, 0, 1, 0, 0), X23 = c(1, 1, 1, 0, 1, 0, 0), X24 = c(1, 1, 0, 0, 1, 0, 0), X25 = c(1, 1, 0, 0, 1, 0, 0), X26 = c(0, 1, 1, 1, 1, 0, 0), X27 = c(0, 1, 0, 0, 1, 0, 0), X28 = c(1, 1, 0, 0, 1, 0, 0), X29 = c(1, 1, 1, 1, 0, 0, 0), X30 = c(1, 1, 0, 0, 0, 1, 0)),  row.names = c("r1", "r2", "r3", "r4", "r5", "r6", "C1"))

output

   X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30
r1  1  1  1  1  0  1  0  0  0   1   1   0   0   1   1   0   0   0   1   1   0   1   1   1   1   0   0   1   1   1
r2  1  1  1  1  1  1  1  1  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
r3  0  0  1  0  0  1  0  1  1   1   1   0   0   0   1   1   1   1   0   1   0   0   1   0   0   1   0   0   1   0
r4  1  1  0  1  0  0  1  0  0   0   1   0   1   1   1   1   0   0   0   0   1   0   0   0   0   1   0   0   1   0
r5  1  1  1  1  1  1  1  1  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   0   0
r6  1  1  1  1  1  1  1  0  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
C1  1  1  1  0  0  0  0  0  0   0   0   0   1   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0

Then I split your data into rows r and row C1.

dfR = dat %>% as_tibble() %>% slice_head(n=nrow(.)-1) 
dfC = dat %>% as_tibble() %>% slice_tail()

output dfR

# A tibble: 6 x 30
     X1    X2    X3    X4    X5    X6    X7    X8    X9   X10   X11   X12   X13   X14   X15   X16   X17   X18   X19   X20   X21
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     1     1     1     1     0     1     0     0     0     1     1     0     0     1     1     0     0     0     1     1     0
2     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1
3     0     0     1     0     0     1     0     1     1     1     1     0     0     0     1     1     1     1     0     1     0
4     1     1     0     1     0     0     1     0     0     0     1     0     1     1     1     1     0     0     0     0     1
5     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1
6     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0
# ... with 9 more variables: X22 <dbl>, X23 <dbl>, X24 <dbl>, X25 <dbl>, X26 <dbl>, X27 <dbl>, X28 <dbl>, X29 <dbl>, X30 <dbl>

output dfC

# A tibble: 1 x 30
     X1    X2    X3    X4    X5    X6    X7    X8    X9   X10   X11   X12   X13   X14   X15   X16   X17   X18   X19   X20   X21
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     1     1     1     0     0     0     0     0     0     0     0     0     1     1     0     0     0     0     0     0     0
# ... with 9 more variables: X22 <dbl>, X23 <dbl>, X24 <dbl>, X25 <dbl>, X26 <dbl>, X27 <dbl>, X28 <dbl>, X29 <dbl>, X30 <dbl>

Next, I have prepared four simple functions. fGetRowValue returns a vector with the values of one row. fGetData creates a tibble with two variables x and C where x is the value of a single row and C is of course the values from your C1 row. fglm returns my model and fCoef returns tibble containing the coefficients from the model.

fGetRowValue = function(data) data %>% 
  pivot_longer(everything()) %>% pull(value)

fGetData = function(dataR, dataC) tibble(
  x = fGetRowValue(dataR),
  C = fGetRowValue(dataC)
)

fglm = function(data) glm(x~C, binomial, data)

fCoef = function(model) tibble(
  Intercept = coef(model)[1],
  C = coef(model)[2]
)

Now is the time for the proper calculations.

dfR %>%  mutate(row = 1:nrow(.)) %>% 
  group_by(row) %>% 
  nest() %>%   #1
  group_modify(~fGetData(.x$data[[1]], dfC)) %>% #2
  nest() %>% #3
  mutate(model = map(data, ~fglm(.x))) %>% #4
  mutate(Ceof = map(model, ~fCoef(.x))) %>% #5
  unnest(Ceof)

output

# A tibble: 6 x 5
# Groups:   row [6]
    row data              model  Intercept        C
  <int> <list>            <list>     <dbl>    <dbl>
1     1 <tibble [30 x 2]> <glm>     0.241   1.15e 0
2     2 <tibble [30 x 2]> <glm>    25.6    -1.98e-9
3     3 <tibble [30 x 2]> <glm>     0.0800 -1.47e 0
4     4 <tibble [30 x 2]> <glm>    -0.754   2.14e 0
5     5 <tibble [30 x 2]> <glm>     2.44    1.71e 1
6     6 <tibble [30 x 2]> <glm>    -1.39    1.79e 0

As the way I do my calculations can be a little confusing let me describe it year by year (see comment numbers).

After step 1, we receive the data in this form

# A tibble: 6 x 2
# Groups:   row [6]
    row data             
  <int> <list>           
1     1 <tibble [1 x 30]>
2     2 <tibble [1 x 30]>
3     3 <tibble [1 x 30]>
4     4 <tibble [1 x 30]>
5     5 <tibble [1 x 30]>
6     6 <tibble [1 x 30]>

Variables data holds a tibble list with values on one row.

After step two, we have it as it is

# A tibble: 180 x 3
# Groups:   row [6]
     row     x     C
   <int> <dbl> <dbl>
 1     1     1     1
 2     1     1     1
 3     1     1     1
 4     1     1     0
 5     1     0     0
 6     1     1     0
 7     1     0     0
 8     1     0     0
 9     1     0     0
10     1     1     0
# ... with 170 more rows

We roll it back up in step three

# A tibble: 6 x 2
# Groups:   row [6]
    row data             
  <int> <list>           
1     1 <tibble [30 x 2]>
2     2 <tibble [30 x 2]>
3     3 <tibble [30 x 2]>
4     4 <tibble [30 x 2]>
5     5 <tibble [30 x 2]>
6     6 <tibble [30 x 2]>

Then in step four we complete our tibble with a variable containing the list of glm models.

# A tibble: 6 x 3
# Groups:   row [6]
    row data              model 
  <int> <list>            <list>
1     1 <tibble [30 x 2]> <glm> 
2     2 <tibble [30 x 2]> <glm> 
3     3 <tibble [30 x 2]> <glm> 
4     4 <tibble [30 x 2]> <glm> 
5     5 <tibble [30 x 2]> <glm> 
6     6 <tibble [30 x 2]> <glm>

We have already taken the last step, which is to download the coefficients from our models.

# A tibble: 6 x 4
# Groups:   row [6]
    row data              model  Ceof            
  <int> <list>            <list> <list>          
1     1 <tibble [30 x 2]> <glm>  <tibble [1 x 2]>
2     2 <tibble [30 x 2]> <glm>  <tibble [1 x 2]>
3     3 <tibble [30 x 2]> <glm>  <tibble [1 x 2]>
4     4 <tibble [30 x 2]> <glm>  <tibble [1 x 2]>
5     5 <tibble [30 x 2]> <glm>  <tibble [1 x 2]>
6     6 <tibble [30 x 2]> <glm>  <tibble [1 x 2]>

Now all you need to do is unnest and we have the result you want.

# A tibble: 6 x 5
# Groups:   row [6]
    row data              model  Intercept        C
  <int> <list>            <list>     <dbl>    <dbl>
1     1 <tibble [30 x 2]> <glm>     0.241   1.15e 0
2     2 <tibble [30 x 2]> <glm>    25.6    -1.98e-9
3     3 <tibble [30 x 2]> <glm>     0.0800 -1.47e 0
4     4 <tibble [30 x 2]> <glm>    -0.754   2.14e 0
5     5 <tibble [30 x 2]> <glm>     2.44    1.71e 1
6     6 <tibble [30 x 2]> <glm>    -1.39    1.79e 0

Note that by the way, all the models and the data used for their construction have been preserved. You can use it freely for further calculations.