R: how do I subset a dataframe?-CodePudding

I want to subset the res.gs data frame to keep only the samples that are mutated. A sample counts as "mutated" if the first character of its n_mutated_group1 value is not "0" (for example "6 of 25", as opposed to "0 of 25"). I then want to create a second data frame, wt.samp, that retains the samples that are not mutated.

# Attempt to split res.gs into mutated rows (mut.samp) and the rest (wt.samp).
# NOTE(review): `%>%` binds tighter than `!=`, so the comparison with "0" is
# applied to the whole data frame returned by mutate(), not just first_letter.
# The row index is therefore a logical matrix with one entry per cell rather
# than one TRUE/FALSE per row.
mut.samp <- res.gs[res.gs %>% mutate(first_letter = substr(n_mutated_group1,1,1))!="0",]
# NOTE(review): presumably empty because mut.samp above already contains every
# row of res.gs -- confirm by inspecting mut.samp.
wt.samp <- setdiff(res.gs, mut.samp)


> dput(res.gs)
structure(list(Hugo_Symbol = c("AKAP9", "AKAP9", "ERCC2", "ERCC2", 
"HECTD1", "HECTD1", "HERC1", "HERC1", "KMT2C", "KMT2C", "MACF1", 
"MACF1", "MROH2B", "MROH2B"), Missense_Mutation = c(9L, 9L, 9L, 
9L, 6L, 6L, 8L, 8L, 19L, 19L, 5L, 5L, 5L, 5L), Nonsense_Mutation = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L), Splice_Site = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L), total = c(9L, 
9L, 9L, 9L, 6L, 6L, 8L, 8L, 20L, 20L, 6L, 6L, 6L, 6L), MutatedSamples = c(6L, 
6L, 9L, 9L, 6L, 6L, 6L, 6L, 8L, 8L, 6L, 6L, 6L, 6L), AlteredSamples = c(6L, 
6L, 9L, 9L, 6L, 6L, 6L, 6L, 8L, 8L, 6L, 6L, 6L, 6L), Group1 = c("Non-Responder", 
"Responder", "Non-Responder", "Responder", "Non-Responder", "Responder", 
"Non-Responder", "Responder", "Non-Responder", "Responder", "Non-Responder", 
"Responder", "Non-Responder", "Responder"), Group2 = c("Rest", 
"Rest", "Rest", "Rest", "Rest", "Rest", "Rest", "Rest", "Rest", 
"Rest", "Rest", "Rest", "Rest", "Rest"), n_mutated_group1 = c("0 of 25", 
"6 of 25", "0 of 25", "9 of 25", "0 of 25", "6 of 25", "0 of 25", 
"6 of 25", "1 of 25", "7 of 25", "0 of 25", "6 of 25", "0 of 25", 
"6 of 25"), n_mutated_group2 = c("6 of 25", "0 of 25", "9 of 25", 
"0 of 25", "6 of 25", "0 of 25", "6 of 25", "0 of 25", "7 of 25", 
"1 of 25", "6 of 25", "0 of 25", "6 of 25", "0 of 25"), p_value = c(0.022289766970618, 
0.022289766970618, 0.00163083541184905, 0.00163083541184905, 
0.022289766970618, 0.022289766970618, 0.022289766970618, 0.022289766970618, 
0.0487971536957187, 0.0487971536957187, 0.022289766970618, 0.022289766970618, 
0.022289766970618, 0.022289766970618), OR = c(0, Inf, 0, Inf, 
0, Inf, 0, Inf, 0.111488645279478, 8.96952328636894, 0, Inf, 
0, Inf), OR_low = c(0, 1.33358819424024, 0, 2.56647319276964, 
0, 1.33358819424024, 0, 1.33358819424024, 0.00228988507629356, 
1.0079479819766, 0, 1.33358819424024, 0, 1.33358819424024), OR_high = c(0.749856668137133, 
Inf, 0.38963976043749, Inf, 0.749856668137133, Inf, 0.749856668137133, 
Inf, 0.992114690322592, 436.703138665198, 0.749856668137133, 
Inf, 0.749856668137133, Inf), fdr = c(0.248902397838568, 0.248902397838568, 
0.109265972593886, 0.109265972593886, 0.248902397838568, 0.248902397838568, 
0.248902397838568, 0.248902397838568, 0.467058471087594, 0.467058471087594, 
0.248902397838568, 0.248902397838568, 0.248902397838568, 0.248902397838568
)), row.names = c(NA, -14L), class = "data.frame")

My code is returning 198 mutants and 0 wt.

CodePudding user response：

As Wimpel showed in a comment, and Andrea M showed in an answer, there are multiple approaches you can take here. However, I think the base function subset() may be the simplest.

# Keep the rows whose n_mutated_group1 value begins with a character other
# than "0"; subset() evaluates the condition inside res.gs.
subset(res.gs, subset = grepl(pattern = '^[^0]', x = n_mutated_group1))

Indexing also works.

# Same row filter written as logical indexing; the empty slot after the comma
# selects every column.
res.gs[grepl(pattern = '^[^0]', x = res.gs$n_mutated_group1), ]

The key is the grepl code from Wimpel.

CodePudding user response：

One way of doing this is with filter() from dplyr, using substr() to extract the first character of each value in the n_mutated_group1 column.

library(dplyr)

# Wild-type rows: n_mutated_group1 starts with "0" (e.g. "0 of 25").
wt.samp <- res.gs %>% filter(substr(n_mutated_group1, 1, 1) == "0")

# Mutated rows: first character is anything other than "0".
# Assigned to mut.samp instead of overwriting res.gs, so the original data
# frame stays intact and both requested subsets exist side by side.
mut.samp <- res.gs %>% filter(substr(n_mutated_group1, 1, 1) != "0")