I'm trying to run logistic regression analyses for each row from R1 to R6 against C1 row of dataframe "dat"
Input data frame
dat <- data.frame(list(X1 = c(1, 1, 0, 1, 1, 1, 0), X2 = c(1, 1, 0, 1, 1, 1, 0), X3 = c(1, 1, 1, 0, 1, 1, 0), X4 = c(1, 1, 0, 1, 1, 1, 0), X5 = c(0, 1, 0, 0, 1, 1, 0), X6 = c(1, 1, 1, 0, 1, 1, 0), X7 = c(0, 1, 0, 1, 1, 1, 0), X8 = c(0, 1, 1, 0, 1, 0, 0), X9 = c(0, 1, 1, 0, 1, 0, 0), X10 = c(1, 1, 1, 0, 1, 0, 0), X11 = c(1, 1, 1, 1, 1, 0, 0), X12 = c(0, 1, 0, 0, 1, 0, 0 ), X13 = c(0, 1, 0, 1, 1, 0, 0), X14 = c(1, 1, 0, 1, 1, 0, 0), X15 = c(1, 1, 1, 1, 1, 0, 0), X16 = c(0, 1, 1, 1, 1, 0, 0), X17 = c(0, 1, 1, 0, 1, 0, 0), X18 = c(0, 1, 1, 0, 1, 0, 0), X19 = c(1, 1, 0, 0, 1, 0, 0), X20 = c(1, 1, 1, 0, 1, 0, 0), X21 = c(0, 1, 0, 1, 1, 0, 0), X22 = c(1, 1, 0, 0, 1, 0, 0), X23 = c(1, 1, 1, 0, 1, 0, 0), X24 = c(1, 1, 0, 0, 1, 0, 0), X25 = c(1, 1, 0, 0, 1, 0, 0), X26 = c(0, 1, 1, 1, 1, 0, 0), X27 = c(0, 1, 0, 0, 1, 0, 0), X28 = c(1, 1, 0, 0, 1, 0, 0), X29 = c(1, 1, 1, 1, 0, 0, 0), X30 = c(1, 1, 0, 0, 0, 1, 0)), row.names = c("r1", "r2", "r3", "r4", "r5", "r6", "C1"))
logistic regression analyses
R1 <- glm(r1 ~ C1, data=dat, family=binomial); coef(summary(R1))[,2]
R2 <- glm(r2 ~ C1, data=dat, family=binomial); coef(summary(R2))[,2]
R3 <- glm(r3 ~ C1, data=dat, family=binomial); coef(summary(R3))[,2]
R4 <- glm(r4 ~ C1, data=dat, family=binomial); coef(summary(R4))[,2]
R5 <- glm(r5 ~ C1, data=dat, family=binomial); coef(summary(R5))[,2]
R6 <- glm(r6 ~ C1, data=dat, family=binomial); coef(summary(R6))[,2]
The real data have 6000 row so it is not possible to do one by one row against C1.
Is there is a way to do it in loop where glm is calculated for each row from R1 to R6 against C1 and extract the output into a new column?
CodePudding user response:
I think this is what you are looking for.
library(tidyverse)
dat %>%
rownames_to_column("var") %>%
pivot_longer(cols = -var) %>%
mutate(grp = ifelse(grepl("r", var), "x", "y")) %>%
group_split(grp) %>%
reduce(full_join, by = "name") %>%
nest(data = -var.x) %>%
mutate(reg = map_dbl(data, ~glm(value.x ~ value.y, data=.x, family=binomial) %>%
summary() %>%
coef() %>%
.[,2])) %>%
select(-data)
#> # A tibble: 6 x 2
#> var.x reg
#> <chr> <dbl>
#> 1 r1 0.373
#> 2 r2 39436.
#> 3 r3 0.366
#> 4 r4 0.373
#> 5 r5 0.732
#> 6 r6 0.413
First we make your row names into a variable name with rownames_to_column
, then we take the dataframe from wide to long using pivot_longer
. Then we split the dataframe based on your "x" variable and your "y" variable for your logistic regression. I then join the split dataframe with itself to get every combination of x and y. Lastly, I nest
the dataframe and run all the regressions for each "row".
Let me know if anything is unclear.
CodePudding user response:
library(tidyverse)
independent_var <- 'C1'
dependent_vars <- setdiff(rownames(dat),independent_var)
compare <- NULL
for(i in dependent_vars){
modelname <- toupper(i)
filtered_data <- dat %>%
t %>%
data.frame %>%
select(all_of(c(i,'C1')))
eval(parse(text=sprintf('%s <- glm(%s ~ %s,data=filtered_data,family=binomial)',modelname,i,independent_var)))
eval(parse(text=sprintf('newrow <- data.frame(model="%s",coef=as.numeric(%s$coefficients[-2]))',modelname,modelname)))
compare <- rbind(compare,newrow)
}
compare
output;
model coef
<chr> <dbl>
1 R1 0.405
2 R2 25.6
3 R3 -0.134
4 R4 -0.405
5 R5 2.64
6 R6 -1.01
CodePudding user response:
This can be done for any number of lines. First, however, a small note. In your data, row C1
contained all the values 0
. This, of course, was not suitable for the glm
function. So I had to edit the C1
line a bit.
library(tidyverse)
dat <- data.frame(list(X1 = c(1, 1, 0, 1, 1, 1, 1), X2 = c(1, 1, 0, 1, 1, 1, 1), X3 = c(1, 1, 1, 0, 1, 1, 1), X4 = c(1, 1, 0, 1, 1, 1, 0), X5 = c(0, 1, 0, 0, 1, 1, 0), X6 = c(1, 1, 1, 0, 1, 1, 0), X7 = c(0, 1, 0, 1, 1, 1, 0), X8 = c(0, 1, 1, 0, 1, 0, 0), X9 = c(0, 1, 1, 0, 1, 0, 0), X10 = c(1, 1, 1, 0, 1, 0, 0), X11 = c(1, 1, 1, 1, 1, 0, 0), X12 = c(0, 1, 0, 0, 1, 0, 0 ), X13 = c(0, 1, 0, 1, 1, 0, 1), X14 = c(1, 1, 0, 1, 1, 0, 1), X15 = c(1, 1, 1, 1, 1, 0, 0), X16 = c(0, 1, 1, 1, 1, 0, 0), X17 = c(0, 1, 1, 0, 1, 0, 0), X18 = c(0, 1, 1, 0, 1, 0, 0), X19 = c(1, 1, 0, 0, 1, 0, 0), X20 = c(1, 1, 1, 0, 1, 0, 0), X21 = c(0, 1, 0, 1, 1, 0, 0), X22 = c(1, 1, 0, 0, 1, 0, 0), X23 = c(1, 1, 1, 0, 1, 0, 0), X24 = c(1, 1, 0, 0, 1, 0, 0), X25 = c(1, 1, 0, 0, 1, 0, 0), X26 = c(0, 1, 1, 1, 1, 0, 0), X27 = c(0, 1, 0, 0, 1, 0, 0), X28 = c(1, 1, 0, 0, 1, 0, 0), X29 = c(1, 1, 1, 1, 0, 0, 0), X30 = c(1, 1, 0, 0, 0, 1, 0)), row.names = c("r1", "r2", "r3", "r4", "r5", "r6", "C1"))
output
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30
r1 1 1 1 1 0 1 0 0 0 1 1 0 0 1 1 0 0 0 1 1 0 1 1 1 1 0 0 1 1 1
r2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
r3 0 0 1 0 0 1 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0 0 1 0 0 1 0 0 1 0
r4 1 1 0 1 0 0 1 0 0 0 1 0 1 1 1 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0
r5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
r6 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
C1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Then I split your data into rows r
and row C1
.
dfR = dat %>% as_tibble() %>% slice_head(n=nrow(.)-1)
dfC = dat %>% as_tibble() %>% slice_tail()
output dfR
# A tibble: 6 x 30
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 1 0 1 0 0 0 1 1 0 0 1 1 0 0 0 1 1 0
2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
3 0 0 1 0 0 1 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0
4 1 1 0 1 0 0 1 0 0 0 1 0 1 1 1 1 0 0 0 0 1
5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
6 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# ... with 9 more variables: X22 <dbl>, X23 <dbl>, X24 <dbl>, X25 <dbl>, X26 <dbl>, X27 <dbl>, X28 <dbl>, X29 <dbl>, X30 <dbl>
output dfC
# A tibble: 1 x 30
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
# ... with 9 more variables: X22 <dbl>, X23 <dbl>, X24 <dbl>, X25 <dbl>, X26 <dbl>, X27 <dbl>, X28 <dbl>, X29 <dbl>, X30 <dbl>
Next, I have prepared four simple functions. fGetRowValue
returns a vector with the values of one row. fGetData
creates a tibble
with two variables x
and C
where x
is the value of a single row and C
is of course the values from your C1
row. fglm
returns my model and fCoef
returns tibble
containing the coefficients from the model.
fGetRowValue = function(data) data %>%
pivot_longer(everything()) %>% pull(value)
fGetData = function(dataR, dataC) tibble(
x = fGetRowValue(dataR),
C = fGetRowValue(dataC)
)
fglm = function(data) glm(x~C, binomial, data)
fCoef = function(model) tibble(
Intercept = coef(model)[1],
C = coef(model)[2]
)
Now is the time for the proper calculations.
dfR %>% mutate(row = 1:nrow(.)) %>%
group_by(row) %>%
nest() %>% #1
group_modify(~fGetData(.x$data[[1]], dfC)) %>% #2
nest() %>% #3
mutate(model = map(data, ~fglm(.x))) %>% #4
mutate(Ceof = map(model, ~fCoef(.x))) %>% #5
unnest(Ceof)
output
# A tibble: 6 x 5
# Groups: row [6]
row data model Intercept C
<int> <list> <list> <dbl> <dbl>
1 1 <tibble [30 x 2]> <glm> 0.241 1.15e 0
2 2 <tibble [30 x 2]> <glm> 25.6 -1.98e-9
3 3 <tibble [30 x 2]> <glm> 0.0800 -1.47e 0
4 4 <tibble [30 x 2]> <glm> -0.754 2.14e 0
5 5 <tibble [30 x 2]> <glm> 2.44 1.71e 1
6 6 <tibble [30 x 2]> <glm> -1.39 1.79e 0
As the way I do my calculations can be a little confusing let me describe it year by year (see comment numbers).
After step 1, we receive the data in this form
# A tibble: 6 x 2
# Groups: row [6]
row data
<int> <list>
1 1 <tibble [1 x 30]>
2 2 <tibble [1 x 30]>
3 3 <tibble [1 x 30]>
4 4 <tibble [1 x 30]>
5 5 <tibble [1 x 30]>
6 6 <tibble [1 x 30]>
Variables data holds a tibble
list with values on one row.
After step two, we have it as it is
# A tibble: 180 x 3
# Groups: row [6]
row x C
<int> <dbl> <dbl>
1 1 1 1
2 1 1 1
3 1 1 1
4 1 1 0
5 1 0 0
6 1 1 0
7 1 0 0
8 1 0 0
9 1 0 0
10 1 1 0
# ... with 170 more rows
We roll it back up in step three
# A tibble: 6 x 2
# Groups: row [6]
row data
<int> <list>
1 1 <tibble [30 x 2]>
2 2 <tibble [30 x 2]>
3 3 <tibble [30 x 2]>
4 4 <tibble [30 x 2]>
5 5 <tibble [30 x 2]>
6 6 <tibble [30 x 2]>
Then in step four we complete our tibble
with a variable containing the list of glm
models.
# A tibble: 6 x 3
# Groups: row [6]
row data model
<int> <list> <list>
1 1 <tibble [30 x 2]> <glm>
2 2 <tibble [30 x 2]> <glm>
3 3 <tibble [30 x 2]> <glm>
4 4 <tibble [30 x 2]> <glm>
5 5 <tibble [30 x 2]> <glm>
6 6 <tibble [30 x 2]> <glm>
We have already taken the last step, which is to download the coefficients from our models.
# A tibble: 6 x 4
# Groups: row [6]
row data model Ceof
<int> <list> <list> <list>
1 1 <tibble [30 x 2]> <glm> <tibble [1 x 2]>
2 2 <tibble [30 x 2]> <glm> <tibble [1 x 2]>
3 3 <tibble [30 x 2]> <glm> <tibble [1 x 2]>
4 4 <tibble [30 x 2]> <glm> <tibble [1 x 2]>
5 5 <tibble [30 x 2]> <glm> <tibble [1 x 2]>
6 6 <tibble [30 x 2]> <glm> <tibble [1 x 2]>
Now all you need to do is unnest
and we have the result you want.
# A tibble: 6 x 5
# Groups: row [6]
row data model Intercept C
<int> <list> <list> <dbl> <dbl>
1 1 <tibble [30 x 2]> <glm> 0.241 1.15e 0
2 2 <tibble [30 x 2]> <glm> 25.6 -1.98e-9
3 3 <tibble [30 x 2]> <glm> 0.0800 -1.47e 0
4 4 <tibble [30 x 2]> <glm> -0.754 2.14e 0
5 5 <tibble [30 x 2]> <glm> 2.44 1.71e 1
6 6 <tibble [30 x 2]> <glm> -1.39 1.79e 0
Note that by the way, all the models and the data used for their construction have been preserved. You can use it freely for further calculations.