I have a list of 15,000 data frames.
A reproducible example for a subset of my list:
# Reproducible two-element subset of the full list: a named list of tibbles
# ("MutA" with 5 rows, "MutB" with 13 rows). Each tibble has the columns
# gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B and Contrast.
# In this subset, "DUSP2" appears only in the gene_name column of "MutB".
mylist <- list(`MutA` = structure(list(gene_name = c("EGR1", "FOS", "DUSP6",
"MTND2P12", "PRKG1"), logFC = c(1.76654254980119, 1.17127823620834,
1.15977097733566, -1.0488230301382, -1.14160426859133), AveExpr = c(7.78123576039834,
6.96495213737487, 8.07712204346697, 4.59443820293176, 4.45363638302119
), t = c(5.56926193951565, 4.52588789847687, 4.30658685558903,
-4.03107776879811, -4.01818157353926), P.Value = c(6.26314546243923e-07,
2.88133442317264e-05, 6.1943827508267e-05, 0.000158106269805187,
0.00016507683166428), adj.P.Val = c(0.00269623938482679, 0.0466977246992681,
0.0466977246992681, 0.0466977246992681, 0.0466977246992681),
B = c(4.41885203345315, 0.664723788950823, -0.078687857472298,
-0.984363656764183, -1.02592992743153), Contrast = c("MutA",
"MutA", "MutA", "MutA", "MutA"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-5L)), `MutB` = structure(list(gene_name = c("PNRC1", "CSRNP1",
"PRDM1", "NR4A2", "ZNF296", "SOCS3", "PHLDA2", "CCNG2", "ARRDC3",
"ID1", "DUSP2", "PIM1", "CYP1A1"), logFC = c(-1.49102955200986,
1.14224802134131, -1.15585383787062, -1.02735547786377, 1.02035553608404,
1.10716238341174, 1.05166841049116, -1.65132479200427, -2.18654607181093,
1.77429205659587, 1.03869096951995, -1.3820459712131, -1.20297178923705
), AveExpr = c(9.78988310015957, 8.73421574183686, 8.8685535024984,
6.17004098129659, 8.58244317394986, 8.70912817604271, 9.03859024590436,
10.0151710372285, 8.92709742690307, 11.0093922646056, 7.08299475394084,
10.3343119884904, 9.78294465250344), t = c(-10.3415666933288,
8.44909235249311, -7.31863312686932, -7.0503766462354, 6.88254971039145,
6.84255405881957, 6.6661634308286, -6.57689174112265, -6.35937247336919,
6.35842394120948, 6.09678138349739, -6.03753531213675, -5.11987079003073
), P.Value = c(5.50391454309638e-15, 8.06366982837864e-12, 6.93094932192831e-10,
1.99580135073543e-09, 3.86347446361424e-09, 4.52128252255361e-09,
9.03556813379214e-09, 1.28176994232671e-08, 2.99700607586158e-08,
3.0080997621089e-08, 8.30549307278899e-08, 1.04416635223401e-07,
3.36695021239758e-06), adj.P.Val = c(3.31715425597876e-10, 2.42994658443276e-07,
7.96336272855565e-06, 1.71835645153534e-05, 2.91059678059458e-05,
2.98990253865829e-05, 4.95058778050471e-05, 6.06942793274083e-05,
0.000113309477851588, 0.000113309477851588, 0.00025028188100196,
0.000299670770870435, 0.00509840238461601), B = c(22.9513308908157,
15.6316206051317, 11.1774398161111, 10.1226391151815, 9.46458483520333,
9.30802775208554, 8.61902127709979, 8.27132811596184, 7.42752571474613,
7.42385776969362, 6.41653390787626, 6.18977942295351, 2.76361636124891
), Contrast = c("MutB", "MutB", "MutB",
"MutB", "MutB", "MutB", "MutB",
"MutB", "MutB", "MutB", "MutB",
"MutB", "MutB")), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -13L)))
I would like to test if a certain column in each dataframe contains a given value, and discard those dataframes that do not contain the value.
For example, if I tested whether the `gene_name` column contains the value `DUSP2`, I would like to discard the `MutA` data frame from the list, since it does not contain `DUSP2`.
Conceptually, it would be something like the following:
# Pseudo-code sketch of the intent, not runnable R: the starred placeholder
# stands for "remove this data frame from the list".
lapply(mylist, function(df) {
# NOTE: df$gene_name is a whole column, so this comparison produces one
# logical value per row rather than the single TRUE/FALSE that `if` expects.
if(df$gene_name!="DUSP2")
***DELETE_df_from_list***
})
CodePudding user response:
I'm not sure about efficiency, since you have quite a few data frames, but here's an option. Following your logic above, you can assign NULL to a table if "DUSP2" is not in its gene_name column, and then use purrr::discard to remove the NULL tables.
library(purrr)

# Keep only the data frames whose gene_name column contains "DUSP2".
# Step 1: lapply() replaces every data frame lacking the gene with NULL.
#         `%in%` returns a single TRUE/FALSE here, so it is safe inside `if`.
# Step 2: purrr::discard() drops the NULL placeholders; the filtered list is
#         assigned back to `mylist` so the removal actually persists.
mylist <- lapply(mylist, function(d) {
  if ("DUSP2" %in% d$gene_name) d else NULL
}) %>%
  discard(is.null)

# Print the filtered list to inspect the result.
mylist