use levene_test in map() function from purrr?-CodePudding

Is there a way to do a Levene Test via the map() function from the purrr package? Or is there another simple way to compute a Levene Test over various variables?

My data frame contains various factor and numeric columns, so I tried with map_if(), which works fine, e.g., for Shapiro tests. However, I do not know how to specify the formula. I want to test all my numeric variables against the "Treatment" factor.

library("tidyverse")
library("rstatix")

data <- data.frame(site = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), 
                                    .Label = c("S1 ", "S2 ", "S3 "), class = "factor"), 
                   plot = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L), 
                                    .Label = c(" Tree 1 ", " Tree 2 ", " Tree 3 "), class = "factor"), 
                   Treatment = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L), .Label = c("T1", "T2"), class = "factor"), 
                   flux1 = c(11.52188065, 8.43156699, 4.495312274, -1.866676811, 3.861102035, -0.814742373, 6.51039536, 4.767950345, 10.36544542, 1.065963875), 
                   flux2 = c(0.142259208, 0.04060245, 0.807631744, 0.060127596, -0.157762562, 0.062464942, 0.043147603, 0.495001652, 0.34363348, 0.134183704), 
                   flux3 = c(0.147506197, 1.131009714, 0.038860728, 0.0176834, 0.053191593, 0.047591306, 0.00573377, -0.034926075, 0.123379247, 0.018882469))

map_if(data, is.numeric, levene_test(. ~ Treatment))

Any suggestions? Thanks for your help!

Now also with an reproducible example ;)

CodePudding user response：

Here is an alternative: First pivot to long data,

Then group_by and apply the formula (here flux should be factor!)

library(tidyr)
library(dplyr)

data %>% 
  pivot_longer(
    cols = starts_with("flux"),
    names_to = "flux",
    values_to = "value"
  ) %>%
  mutate(flux = as.factor(flux)) %>% 
  group_by(flux) %>% 
  levene_test(value ~ Treatment)

  flux    df1   df2 statistic     p
  <fct> <int> <int>     <dbl> <dbl>
1 flux1     1     8     0.410 0.540
2 flux2     1     8     2.85  0.130
3 flux3     1     8     1.11  0.323

CodePudding user response：

You can also use summarize a bit more directly. Then pivot and unnest the results.

library(dplyr)
library(tidyr)

data %>% 
  summarize(across(where(is.numeric),
                   ~ list(levene_test(cur_data(), . ~ Treatment)))) %>% 
  pivot_longer(everything(), names_to = "flux", values_to = "levene_test") %>% 
  unnest(levene_test)

Another option is to just feed the variable names into map and create the formula.

library(purrr)

names(data)[map_lgl(data, is.numeric)] %>% 
  set_names() %>% 
  map_dfr(~ levene_test(data, as.formula(paste(.x, "~ Treatment"))), .id = "flux")

Result (for both):

# A tibble: 3 x 5
  flux    df1   df2 statistic     p
  <chr> <int> <int>     <dbl> <dbl>
1 flux1     1     8     0.410 0.540
2 flux2     1     8     2.85  0.130
3 flux3     1     8     1.11  0.323

CodePudding user response：

The issue is that map loops over the columns and it is no longer a data.frame whereas levene_test expects a data.frame/tibble. According to ?levene_test

data - a data frame for evaluating the formula or a model

therefore, instead of using map_if directly, select the columns that are numeric (select(where(is.numeric))), get the column names (names), loop over those in map, select only the 'Treatment' and the looped column from the data, create the formula with reformulate and apply levene_test

library(rstatix)
library(dplyr)
library(purrr)
data %>% 
   select(where(is.numeric)) %>%
   names %>%
   map_dfr(~ data %>%
             select(Treatment, all_of(.x)) %>% 
       {levene_test(reformulate("Treatment", response = names(.)[2]), data = .)
         })

-output

# A tibble: 3 × 4
    df1   df2 statistic     p
  <int> <int>     <dbl> <dbl>
1     1     8     0.410 0.540
2     1     8     2.85  0.130
3     1     8     1.11  0.323

It may also done using across though - i.e. loop across the columns that are numeric in summarise, use the data as cur_data(), create the formula with reformulate, apply the levene_test, return the output in a list, unclass and use bind_rows (because unclass will remove the data.frame attribute from the list)

data %>%
   summarise(across(where(is.numeric),
    ~  list(cur_data() %>%
     levene_test(reformulate("Treatment", response = cur_column()))))) %>% 
   unclass %>%
   unname %>%
   bind_rows
# A tibble: 3 × 4
    df1   df2 statistic     p
  <int> <int>     <dbl> <dbl>
1     1     8     0.410 0.540
2     1     8     2.85  0.130
3     1     8     1.11  0.323

If we need the 'flux' column identifier, either use the summarise step without wrapping the output in a list and then use bind_rows with .id

data %>%
    summarise(across(where(is.numeric),
     ~  cur_data() %>%
      levene_test(reformulate("Treatment", response = cur_column())))) %>%
    unclass %>% 
    bind_rows(.id = 'flux')
# A tibble: 3 × 5
  flux    df1   df2 statistic     p
  <chr> <int> <int>     <dbl> <dbl>
1 flux1     1     8     0.410 0.540
2 flux2     1     8     2.85  0.130
3 flux3     1     8     1.11  0.323

Or another option is with the OP's map_if itself

map_if(data, is.numeric, 
    ~ levene_test(. ~ Treatment, 
  data = tibble(.x, Treatment = data$Treatment) ), .else = ~ NULL) %>% 
   bind_rows(.id = 'flux')
# A tibble: 3 × 5
  flux    df1   df2 statistic     p
  <chr> <int> <int>     <dbl> <dbl>
1 flux1     1     8     0.410 0.540
2 flux2     1     8     2.85  0.130
3 flux3     1     8     1.11  0.323