Chi squared test of independence across all variables-CodePudding

In my dataset I have many categorical variables and I want to detect associations between them. However, I am struggling to automate this so that I do not have to run a chi-squared test by hand for every pair of variables.

For example, let's say I have a dataframe.

# Raw inputs: five parallel vectors, four observations each ----
set.seed(123)
fruit <- c("apple", "orange", "orange", "pear")
fav_number <- seq(from = 1, to = 4, by = 1)
place <- c("nigeria", "india", "usa", "mexico")
weather <- c("summer", "winter", "spring", "summer")
car <- c("bmw", "mercedes", "honda", "honda")

# Assemble the data frame ----
# cbind() goes through a character matrix, so fav_number becomes text as well.
df <- as.data.frame(cbind(fruit, fav_number, place, weather, car))

# Turn every character column into a factor; other columns pass through ----
df[] <- lapply(
  df,
  function(col) if (is.character(col)) as.factor(col) else col
)

So my output/df looks like:

   fruit fav_number   place weather      car
1  apple          1 nigeria  summer      bmw
2 orange          2   india  winter mercedes
3 orange          3     usa  spring    honda
4   pear          4  mexico  summer    honda

I can do a Chi sq test between two variables with:

# Chi-squared test for one pair of columns: cross-tabulate place against
# fav_number and pass the resulting contingency table to chisq.test().
chisq.test(table(df$place,df$fav_number))

But I want to run the same test for every pair of variables. The output I'm looking for is similar to a correlation matrix for continuous variables.

CodePudding user response：

Using outer.

## chi^2
# Matrix of test statistics for every pair of columns of df. outer() pairs
# the columns up and Vectorize() applies the test function pair by pair.
# simulate.p.value is spelled out in full instead of relying on partial
# argument matching.
outer(
  df, df,
  Vectorize(\(x, y) chisq.test(table(x, y), simulate.p.value = TRUE)$statistic)
)
#            fruit fav_number place weather car
# fruit          8          8     8       4   5
# fav_number     8         12    12       8   8
# place          8         12    12       8   8
# weather        4          8     8       8   5
# car            5          8     8       5   8

## p-value
# Matrix of p-values for every pair of columns of df. simulate.p.value is
# spelled out in full instead of relying on partial argument matching.
# The p-values are simulated, so the figures below change slightly between
# runs and the matrix is not exactly symmetric.
outer(
  df, df,
  Vectorize(\(x, y) chisq.test(table(x, y), simulate.p.value = TRUE)$p.value)
)
#                fruit fav_number place   weather       car
# fruit      0.1699150          1     1 1.0000000 0.8385807
# fav_number 1.0000000          1     1 1.0000000 1.0000000
# place      1.0000000          1     1 1.0000000 1.0000000
# weather    1.0000000          1     1 0.1749125 0.8255872
# car        0.8255872          1     1 0.8430785 0.1704148

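Please note that we use simulate.p.value=TRUE here to get rid of the warnings that the "approximation may be incorrect". Because the p-values are then obtained by simulation, they differ slightly from run to run, which is why the p-value matrix above is not exactly symmetric. This post on Cross Validated elaborates on that topic.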

Data:

# Example data: four rows, five factor columns. Levels are listed explicitly
# so the integer codes behind each factor do not depend on sort order.
df <- data.frame(
  fruit = factor(
    c("apple", "orange", "orange", "pear"),
    levels = c("apple", "orange", "pear")
  ),
  fav_number = factor(1:4, levels = c("1", "2", "3", "4")),
  place = factor(
    c("nigeria", "india", "usa", "mexico"),
    levels = c("india", "mexico", "nigeria", "usa")
  ),
  weather = factor(
    c("summer", "winter", "spring", "summer"),
    levels = c("spring", "summer", "winter")
  ),
  car = factor(
    c("bmw", "mercedes", "honda", "honda"),
    levels = c("bmw", "honda", "mercedes")
  )
)

CodePudding user response：

# Create variables
set.seed(123)
fruit <- c("apple", "orange", "orange", "pear")
fav_number <- seq(from = 1, to = 4, by = 1)
place <- c("nigeria", "india", "usa", "mexico")
weather <- c("summer", "winter", "spring", "summer")
car <- c("bmw", "mercedes", "honda", "honda")

# Create dataframe (cbind() makes every column, including fav_number, text)
df <- as.data.frame(cbind(fruit, fav_number, place, weather, car))

# Convert all character columns to factors
df[] <- lapply(
  df,
  function(col) if (is.character(col)) as.factor(col) else col
)

# Every ordered pair of column names. Keeping them as character (rather than
# expand.grid()'s default factors) means columns are looked up by name below,
# not by the factor's integer code.
eg <- expand.grid(names(df), names(df), stringsAsFactors = FALSE)

# Drop the pairs of a column with itself. A logical mask is used because
# negative indexing with which() selects nothing when there is no match.
eg <- eg[eg$Var1 != eg$Var2, ]

# Print one labelled chi-squared test per remaining pair.
# seq_len() keeps the loop from running when eg has no rows.
for (i in seq_len(nrow(eg))) {
  print(rep("#", 20))
  cat(eg$Var1[i], eg$Var2[i], "\n")
  print(chisq.test(table(df[[eg$Var1[i]]], df[[eg$Var2[i]]])))
}

CodePudding user response：

Using combn to get all combinations

# All unordered pairs of column names, one pair per row.
all_combos <- t(combn(names(df), 2))

# Run one test per row of all_combos. The pair is held in a local called x so
# that the call stays table(df[x]), the expression echoed in each result's
# printed "data:" line.
all_chis <- lapply(
  seq_len(nrow(all_combos)),
  \(i) {
    x <- all_combos[i, ]
    chisq.test(table(df[x]))
  }
)

# Label each result "<first>_<second>" after the pair it tests.
lbls <- paste(all_combos[, 1], all_combos[, 2], sep = "_")
names(all_chis) <- lbls

The output is a list

> all_chis
$fruit_fav_number

    Pearson's Chi-squared test

data:  table(df[x])
X-squared = 8, df = 6, p-value = 0.2381


$fruit_place

    Pearson's Chi-squared test

data:  table(df[x])
X-squared = 8, df = 6, p-value = 0.2381


$fruit_weather

    Pearson's Chi-squared test

data:  table(df[x])
X-squared = 4, df = 4, p-value = 0.406


$fruit_car

    Pearson's Chi-squared test

data:  table(df[x])
X-squared = 5, df = 4, p-value = 0.2873


$fav_number_place

    Pearson's Chi-squared test

data:  table(df[x])
X-squared = 12, df = 9, p-value = 0.2133


$fav_number_weather

    Pearson's Chi-squared test

data:  table(df[x])
X-squared = 8, df = 6, p-value = 0.2381


$fav_number_car

    Pearson's Chi-squared test

data:  table(df[x])
X-squared = 8, df = 6, p-value = 0.2381


$place_weather

    Pearson's Chi-squared test

data:  table(df[x])
X-squared = 8, df = 6, p-value = 0.2381


$place_car

    Pearson's Chi-squared test

data:  table(df[x])
X-squared = 8, df = 6, p-value = 0.2381


$weather_car

    Pearson's Chi-squared test

data:  table(df[x])
X-squared = 5, df = 4, p-value = 0.2873

I think there is a way to edit the data field in the chisq.test output, but I haven't figured out how to access it. The workaround is to use the lbls I created. But that seems too hacky.