How to perform column-wise wilcox.test and fisher exact on groups in R-CodePudding

I have data.frame df1:

# Example input: one row per En_ID, four "Nor" samples followed by five "Tor"
# samples (all numeric).
df1 <- data.frame(
  En_ID = c(
    "KNT00000000003", "KNT00000000005",
    "KNT00000000419", "KNT00000000457",
    "KNT00000000460", "KNT00000000938",
    "KNT00000000971", "KNT00000001036",
    "KNT00000001084", "KNT00000001167"
  ),
  Nor1 = c(
    -0.834165161710272, 1.02199443531549,
    -0.558658947885705, -0.390114219973209,
    -1.23551839713296, 3.11429434221998,
    0.283932163407262, -1.16908518620064,
    -0.597054772455507, -0.593624543273255
  ),
  Nor2 = c(
    -1.18531035488942, 0.423719727339646,
    -1.23261719368372, 0.0855281133529292,
    -1.52366830232278, 3.36692586561211,
    1.00323690950956, -0.000211248816114964,
    -4.74738483548391, -0.318176231083024
  ),
  Nor3 = c(
    -0.262659255267546, 1.3962481061442,
    -0.548673555705647, -0.0149651083306594,
    -1.45458689193089, 2.54126941463459,
    1.17711308509307, -1.19425284921181,
    1.17788731755683, -0.367897054652365
  ),
  Nor4 = c(
    -0.840752912305256, 0.536548846040064,
    -0.277409459604357, -0.241073614962264,
    -0.875313153342293, 1.61789645804321,
    0.412287101096504, -1.11846661523232,
    -2.6274528854429, -0.760452698231182
  ),
  Tor1 = c(
    -0.968784779247286, -0.502809694119192,
    -0.231526399163731, -0.530038395734114,
    -0.706006018337411, 3.58264357077653,
    -0.127521010699219, 0.270523387217103,
    1.68335644352003, -0.314902131571829
  ),
  Tor2 = c(
    -0.481754175843152, -0.440784040523259,
    -0.532975340622715, -0.182089795101371,
    -0.564807490336052, 1.74119896504534,
    -0.96169805631325, -0.721782763145306,
    -0.433459827401695, -0.727495835245995
  ),
  Tor3 = c(
    -0.889343429110847, 1.07937149728343,
    -0.215144871523998, -0.92234350748557,
    -0.832108253417702, 2.02456082994848,
    -0.0434322861759954, -0.523126561938426,
    -0.556984056084809, -0.740331742513503
  ),
  Tor4 = c(
    -0.858141567384178, 1.87728717064375,
    -0.381047638414538, -0.613568289061259,
    -1.92838339196505, 2.23393705735665,
    0.635389543483408, -0.466053620529111,
    -1.50483745357134, -1.33400859143521
  ),
  Tor5 = c(
    -0.486388736112514, 0.789390852922639,
    -0.869434195504952, -0.70405854858187,
    -1.16488184095428, 2.91497178849082,
    -2.10331904053714, -0.571130459068143,
    -0.219526004620518, -0.301435496557957
  )
)

How can I run a Wilcoxon test (wilcox.test) and Fisher's exact test for each row, comparing the Nor1, Nor2, Nor3, and Nor4 columns with the Tor1, Tor2, Tor3, Tor4, and Tor5 columns? Then, I would like to add the p-values of both tests as new columns at the end, resulting in df2:

# Desired output layout: the input columns followed by one p-value column per
# test. The p-values are placeholders only; they are kept inside [0, 1] because
# a p-value can never be negative. Each sample column appears exactly once
# (a repeated Tor4 would be silently renamed to Tor4.1 by data.frame()), and
# the p-value columns use syntactic names so data.frame() keeps them as written.
df2 <- data.frame(
  En_ID = c(
    "KNT00000000003", "KNT00000000005", "KNT00000000419", "KNT00000000457",
    "KNT00000000460", "KNT00000000938", "KNT00000000971", "KNT00000001036",
    "KNT00000001084", "KNT00000001167"
  ),
  Nor1 = c(
    -0.834165161710272, 1.02199443531549, -0.558658947885705,
    -0.390114219973209, -1.23551839713296, 3.11429434221998,
    0.283932163407262, -1.16908518620064, -0.597054772455507,
    -0.593624543273255
  ),
  Nor2 = c(
    -1.18531035488942, 0.423719727339646, -1.23261719368372,
    0.0855281133529292, -1.52366830232278, 3.36692586561211,
    1.00323690950956, -0.000211248816114964, -4.74738483548391,
    -0.318176231083024
  ),
  Nor3 = c(
    -0.262659255267546, 1.3962481061442, -0.548673555705647,
    -0.0149651083306594, -1.45458689193089, 2.54126941463459,
    1.17711308509307, -1.19425284921181, 1.17788731755683,
    -0.367897054652365
  ),
  Nor4 = c(
    -0.840752912305256, 0.536548846040064, -0.277409459604357,
    -0.241073614962264, -0.875313153342293, 1.61789645804321,
    0.412287101096504, -1.11846661523232, -2.6274528854429,
    -0.760452698231182
  ),
  Tor1 = c(
    -0.968784779247286, -0.502809694119192, -0.231526399163731,
    -0.530038395734114, -0.706006018337411, 3.58264357077653,
    -0.127521010699219, 0.270523387217103, 1.68335644352003,
    -0.314902131571829
  ),
  Tor2 = c(
    -0.481754175843152, -0.440784040523259, -0.532975340622715,
    -0.182089795101371, -0.564807490336052, 1.74119896504534,
    -0.96169805631325, -0.721782763145306, -0.433459827401695,
    -0.727495835245995
  ),
  Tor3 = c(
    -0.889343429110847, 1.07937149728343, -0.215144871523998,
    -0.92234350748557, -0.832108253417702, 2.02456082994848,
    -0.0434322861759954, -0.523126561938426, -0.556984056084809,
    -0.740331742513503
  ),
  Tor4 = c(
    -0.858141567384178, 1.87728717064375, -0.381047638414538,
    -0.613568289061259, -1.92838339196505, 2.23393705735665,
    0.635389543483408, -0.466053620529111, -1.50483745357134,
    -1.33400859143521
  ),
  Tor5 = c(
    -0.486388736112514, 0.789390852922639, -0.869434195504952,
    -0.70405854858187, -1.16488184095428, 2.91497178849082,
    -2.10331904053714, -0.571130459068143, -0.219526004620518,
    -0.301435496557957
  ),
  p.value.wilcox = c(0.8, 0.3, 0.7, 0.8, 0.9, 0.8, 0.7, 0.5, 0.7, 0.9),
  p.value.fisher = c(0.1, 0.7, 0.3, 0.1, 0.5, 0.3, 0.9, 0.2, 0.9, 0.4)
)

Here I am using dummy p-values to outline the desired output. The real data have >200 columns, and the two groups (Nor and Tor) have unequal numbers of samples.

I found some examples on Stack Overflow, as mentioned below, and tried to replicate them, but failed miserably.

Wilcoxon test

Fisher's exact test

Please help me.

CodePudding user response：

fisher.test(x, y) requires x and y to have the same length, so I am assuming you have the same number of Nor and Tor columns.

Using dplyr rowwise you may use -

library(dplyr)

# Fisher's exact test on a median split of one row.
#
# Fisher's exact test needs categorical data. Passing the raw continuous
# vectors to fisher.test(x, y) treats every distinct value as its own category,
# so with all-distinct values the p-value is always 1. Instead, each value is
# classified as above / not above the pooled row median and tested against its
# group label in a 2 x 2 table. Because the table is built from group labels,
# the two groups may have different sizes and no column has to be dropped.
#
# nor, tor: numeric vectors holding one row's values for each group.
# Returns the two-sided p-value (a single number).
fisher_median_p <- function(nor, tor) {
  values <- c(nor, tor)
  group <- factor(
    rep(c("Nor", "Tor"), times = c(length(nor), length(tor))),
    levels = c("Nor", "Tor")
  )
  # Fixed levels keep the table 2 x 2 even when no value exceeds the median.
  above_median <- factor(
    values > median(values, na.rm = TRUE),
    levels = c(FALSE, TRUE)
  )
  fisher.test(table(group, above_median))$p.value
}

# wilcox.test() accepts groups of unequal size, so every Nor* and Tor* column
# is used.
df1 %>%
  rowwise() %>%
  mutate(
    p.value.wilcox = wilcox.test(
      c_across(starts_with("Nor")),
      c_across(starts_with("Tor"))
    )$p.value,
    p.value.fisher = fisher_median_p(
      c_across(starts_with("Nor")),
      c_across(starts_with("Tor"))
    )
  ) %>%
  ungroup()  # drop the row-wise grouping so later verbs act on whole columns

Or in base R, using apply -

# Select the group columns by name. Positional indices taken from names(df1)
# must not be applied to rows of df1[-1]: dropping En_ID shifts every position
# by one, so the "Nor" group would silently include Tor1 and lose Nor1.
nor_cols <- grep("^Nor", names(df1), value = TRUE)
tor_cols <- grep("^Tor", names(df1), value = TRUE)

# Group label for each value of a row, in the order c(nor_cols, tor_cols).
# The two groups may have different sizes.
group <- factor(
  rep(c("Nor", "Tor"), times = c(length(nor_cols), length(tor_cols))),
  levels = c("Nor", "Tor")
)

# Both p-values for one row `x` (a named numeric vector).
# Fisher's exact test needs categorical data, so the values are split at the
# pooled row median and tested as a 2 x 2 table (group x above-median); raw
# continuous vectors would make every distinct value its own category.
row_p_values <- function(x) {
  nor <- x[nor_cols]
  tor <- x[tor_cols]
  values <- c(nor, tor)
  # Fixed levels keep the table 2 x 2 even when no value exceeds the median.
  above_median <- factor(
    values > median(values, na.rm = TRUE),
    levels = c(FALSE, TRUE)
  )
  c(
    p.value.wilcox = wilcox.test(nor, tor)$p.value,
    p.value.fisher = fisher.test(table(group, above_median))$p.value
  )
}

# apply() returns one column per input row, hence the transpose before binding.
cbind(
  df1,
  t(apply(as.matrix(df1[c(nor_cols, tor_cols)]), 1, row_p_values))
)

CodePudding user response：

Fisher's exact test is meant for categorical variables, and I am not sure you can apply it to your continuous data. For the Mann-Whitney (Wilcoxon rank-sum) test you could use this:

# Mann-Whitney (Wilcoxon rank-sum) p-value for every row of df1, stored in a
# new column p_val_MW. Columns 2:5 hold Nor1..Nor4 and columns 6:10 hold
# Tor1..Tor5. exact = FALSE requests the normal approximation.
# seq_len() makes a zero-row data frame yield an empty column instead of
# iterating over c(1, 0), and vapply() fills the column in one assignment.
df1$p_val_MW <- vapply(
  seq_len(nrow(df1)),
  function(i) {
    wilcox.test(
      as.numeric(df1[i, 2:5]),
      as.numeric(df1[i, 6:10]),
      exact = FALSE
    )$p.value
  },
  numeric(1)
)