Home > Back-end >  how to select a character from a column in a data frame in R?
how to select a character from a column in a data frame in R?

Time:03-20

I have a long format data frame called ind_long_sub . I want to select only characters such sum or mean from the characters in indices column, so that I can create another column contain only characters such as mean and sum. I have tried substr() and strsplit() functions, but could not achieved anything, because the characters in the column have different dimensions.

Any thoughts or ideas please?

# reshape data to long format
> ind_long <- ind_csv %>% pivot_longer(3:12, names_to = "indices")
> view(ind_long)
> #create subset
> ind_long_sub <- subset(ind_long, year >= 2001 & year <= 2003)
> ind_long_sub
# A tibble: 360 x 4
    year month indices           value
   <int> <int> <chr>             <dbl>
 1  2001     1 gridded_idw2_sum 156.  
 2  2001     1 Mada4_sum         87   
 3  2001     1 Mada22_sum       185   
 4  2001     1 Mada38_sum        85   
 5  2001     1 Mada53_sum       180   
 6  2001     1 GDD.AS_sum       546.  
 7  2001     1 GDD.Ch_sum       552.  
 8  2001     1 hum.AS.mean        0.82
 9  2001     1 hum.Ch.mean        0.83
10  2001     1 ndvi.mean          0.72
# ... with 350 more rows
> dput(ind_long_sub)
structure(list(year = c(2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2003L, 2003L, 2003L), month = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L), indices = c("gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean", "gridded_idw2_sum", 
"Mada4_sum", "Mada22_sum", "Mada38_sum", "Mada53_sum", "GDD.AS_sum", 
"GDD.Ch_sum", "hum.AS.mean", "hum.Ch.mean", "ndvi.mean"), value = c(156.37, 
87, 185, 85, 180, 546.2, 552.1, 0.82, 0.83, 0.72, 30.53, 12, 
34, 38, 46, 517.8, 533.2, 0.74, 0.73, 0.42, 207.65, 189, 249, 
174, 286, 577.8, 579.8, 0.8, 0.81, 0.44, 235.14, 210, 289, 239, 
206, 571.7, 579.1, 0.85, 0.83, 0.57, 146.38, 141, 156, 132, 121, 
587.8, 592.9, 0.85, 0.84, 0.62, 167.42, 175, 113, 110, 297, 543.8, 
548.9, 0.85, 0.84, 0.75, 108.58, 100, 123, 137, 66, 566.3, 575, 
0.85, 0.83, 0.62, 154.86, 212, 134, 165, 230, 561, 569.2, 0.85, 
0.84, 0.52, 229.65, 251, 198, 202, 241, 537.9, 533.5, 0.85, 0.86, 
0.39, 343.23, 386, 250, 411, 397, 552.5, 545.6, 0.87, 0.88, 0.48, 
147.07, 174, 147, 94, 233, 540.5, 527.7, 0.83, 0.84, 0.65, 89.54, 
81, 82, 120, 116, 551.2, 538.5, 0.8, 0.82, 0.7, 0.36, 3, 0, 0, 
0, 552.1, 552.3, 0.68, 0.74, 0.53, 1.45, 0, 0, 0, 0, 520.8, 542.8, 
0.64, 0.68, 0.33, 36.97, 6, 61, 25, 79, 615.2, 619.5, 0.67, 0.7, 
0.28, 268.37, 196, 317, 243, 316, 583.3, 585.9, 0.79, 0.81, 0.43, 
122, 95, 93, 187, 162, 597.2, 593.7, 0.8, 0.83, 0.7, 145.84, 
186, 297, 94, 92, 557.1, 554.9, 0.83, 0.85, 0.8, 222.34, 267, 
213, 231, 263, 559.1, 556.5, 0.84, 0.87, 0.75, 200.16, 249, 181, 
257, 245, 553.4, 552, 0.84, 0.87, 0.57, 364.37, 251, 273, 458, 
575, 530, 524.3, 0.85, 0.88, 0.6, 207.03, 183, 187, 168, 245, 
557.3, 555.8, 0.84, 0.87, 0.66, 227.27, 252, 263, 248, 190, 533.6, 
534.7, 0.84, 0.86, 0.79, 145.6, 96, 142, 206, 83, 573.1, 571.5, 
0.8, 0.84, 0.71, 9.89, 2, 9, 2, 49, 563.1, 565.5, 0.71, 0.76, 
0.54, 21.65, 54, 5, 29, 76, 531.2, 539.8, 0.67, 0.72, 0.34, 204.21, 
212, 209, 185, 191, 607.2, 606, 0.73, 0.78, 0.4, 131.67, 70, 
172, 78, 109, 583.6, 586.7, 0.8, 0.81, 0.46, 126.64, 219, 125, 
102, 159, 585.8, 588.5, 0.83, 0.84, 0.76, 173.15, 110, 200, 249, 
123, 552, 547.8, 0.84, 0.85, 0.76, 261.72, 207, 232, 205, 404, 
552.8, 552.4, 0.85, 0.86, 0.58, 291.09, 288, 320, 345, 351, 561, 
554.6, 0.85, 0.87, 0.41, 328.42, 328, 295, 366, 355, 528.3, 523.9, 
0.85, 0.88, 0.56, 524.7, 339, 382, 612, 807, 528.7, 527, 0.88, 
0.9, 0.62, 174.36, 142, 167, 93, 296, 545.9, 538.9, 0.84, 0.85, 
0.68, 42.39, 26, 46, 33, 45, 552.1, 541.8, 0.75, 0.8, 0.79)), row.names = c(NA, 
-360L), class = c("tbl_df", "tbl", "data.frame"))

CodePudding user response:

You could use str_extract:

stringr::str_extract(ind_long_sub$indices,'sum|mean')

  [1] "sum"  "sum"  "sum"  "sum"  "sum"  "sum"  "sum"  "mean" "mean" "mean" "sum" 
 [12] "sum"  "sum"  "sum"  "sum"  "sum"  "sum"  "mean" "mean" "mean" "sum"  "sum"
 ...

CodePudding user response:

A base R option using sub -

We can drop everything till a dot (.) or an underscore (_).

sub('.*(\\.|_)', '', ind_long_sub$indices)

#[1] "sum"  "sum"  "sum"  "sum"  "sum"  "sum"  "sum"  "mean" "mean" "mean" "sum"  "sum" 
#[13] "sum"  "sum"  "sum"  "sum"  "sum"  "mean" "mean" "mean" "sum"  "sum"  "sum"  "sum" 
#[25] "sum"  "sum"  "sum"  "mean" "mean" "mean" "sum"  "sum"  "sum"  "sum"  "sum"  "sum" 
#...
#...

CodePudding user response:

In base R, we can use trimws

trimws(ind_long_sub$indices, whitespace = ".*[._]")

-output

 [1] "sum"  "sum"  "sum"  "sum"  "sum"  "sum"  "sum"  "mean" "mean" "mean" "sum"  "sum"  "sum"  "sum"  "sum"  "sum"  "sum" 
 [18] "mean" "mean" "mean" "sum"  "sum"  "sum"  "sum"  "sum"  "sum"  "sum"  "mean" "mean" "mean" "sum"  "sum"  "sum"  "sum" 
 [35] "sum"  "sum"  "sum"  "mean" "mean" "mean" "sum"  "sum"  "sum"  "sum"  "sum"  "sum"  "sum"  "mean" "mean" "mean" "sum" 
 [52] "sum"  "sum"  "sum"  "sum"  "sum"  "sum"  ...
  •  Tags:  
  • r
  • Related