This is what the example data looks like:
exp_data
# Name Greg Matt
# 1 Y.L[ 12,000]STISKDLITY.M NA L[ 12]
# 2 Y.L[ 12,000]STISKDLITY.M NA L[ 12]
dput:
# Reproducible example data, as produced by dput().
# Builds a two-row data.frame with three columns:
#   Name - factor; both rows carry level code 71L
#   Greg - c(NA, NA)
#   Matt - factor; both rows carry level code 6L
# The .Label vectors hold the full level sets of the factors, not only the
# values present in these two rows.
exp_data <- structure(list(Name = structure(c(71L,71L), .Label = c("F.AM[ 15,995]KTKAAL.A", "F.AMKTKAAL.A", "F.EKIKAAY.L",
"F.EKIKAAYL.S", "F.NPTAGC[ 58,005]ASL[ 12,000]AKEM[ 12,000]F[ 1151,607].A",
"F.QGRVTM[ 15,995].T", "F.SGSNSGNTATL.T", "F.TGYY.M", "F.TNC[ 58,005]DF[ 1151,607]EKIKAAY.L",
"L.DKSITSL[ 370,222]Y.A", "L.DY[ 12,000]WGQGTL.V", "L.DYWGQGTL.V",
"L.EQVSQL.Q", "L.EQVSQLQGLW.R", "L.EWMGW.I", "L.ITY[ 1151,607]M[ 15,995]SGTKSTEF.N",
"L.KQQGGGLEVL.F", "L.KQQGGGLEVLF.Q", "L.L[ 504,270]KQQGGGLEVL.F",
"L.LKQQGGGL.E", "L.LKQQGGGLEVL.F", "L.QGLW.R", "L.RSDDTAVY.Y",
"L.RSDDTAVYY.C", "L.SRLRSDDTAVY.Y", "L.SRLRSDDTAVYY.C", "L.STISKDL[ 12,000]ITY.M",
"L.STISKDLITY.M", "L.STISKDLITY[ 1012,607]M[ 15,995].S", "L.STISKDLITY[ 12,000].M",
"L.STISKDLITY[ 12,000]M[ 386,228].S", "L.STISKDLITY[ 2918,448].M",
"L.STISKDLITY[ 762,322]M[ 15,995].S", "L.STISKDLITYM.S", "L.STISKDLITYM[ 1282,648].S",
"L.STISKDLITYM[ 1456,695].S", "L.STISKDLITYM[ 1490,759].S", "L.STISKDLITYM[ 371,206].S",
"L.TEIQSL.T", "L.TISRVEAGDEADY.Y", "L.TISRVEAGDEADY[ 12,000]Y.C",
"L.TISRVEAGDEADYY.C", "L.TISRVEAGDEADYY[ 12,000].C", "L.VTVSSGGGSEGGGSEGGGSEGGGSGSY.V",
"L.VTVSSGGGSEGGGSEGGGSEGGGSGSY[ 1239,661].V", "L.VTVSSGGGSEGGGSEGGGSEGGGSGSY[ 1987,847].V",
"L.VVY[ 1501,680]DDSDRPSGIPERF.S", "L.VVYDDSDRPSGIPERF.S", "M.KKARKSKVTTNKC[ 58,005]L[ 2909,467]EQVSQLQGL.W",
"M.SGTKSTEF.N", "M.TELDYW.G", "M.TRDTSISTAY.M", "M.TRDTSISTAY[ 12,000].M",
"M.TRDTSISTAYM.E", "M.TRDTSISTAYMEL.S", "W.GQGTL.V", "W.GQGTLVTVSSGGGSEGGGSEGGGSEGGGSGSY.V",
"W.GQGTLVTVSSGGGSEGGGSEGGGSEGGGSGSY[ 1239,661].V", "W.INPNSGGTNY.A",
"W.INPNSGGTNY[ 12,000].A", "W.VRQAPGQGL.E", "W.VRQAPGQGLEW.M",
"W.VRQAPGQGLEW[ 12,000]M[ 486,244].G", "W.VRQAPGQGLEWM.G", "W.Y[ 12,000]QQKPGQAPVLVVY.D",
"W.YQQKPGQAPVL.V", "W.YQQKPGQAPVL[ 12,000]VVY.D", "W.YQQKPGQAPVLVVY.D",
"Y.AQKF.Q", "Y.DDSDRPSGIPERF.S", "Y.L[ 12,000]STISKDLITY.M",
"Y.LSTISKDL.I", "Y.LSTISKDL[ 12,000]ITY.M", "Y.LSTISKDLITY.M",
"Y.M[ 12,000]SGTKSTEF.N", "Y.M[ 15,995]EL.S", "Y.M[ 15,995]ELSRL.R",
"Y.M[ 15,995]SGTKSTEF.N", "Y.MELSRL.R", "Y.MSGTKSTEF.N", "Y.QQKPGQAPVL.V",
"Y.QQKPGQAPVL[ 12,000]VVY.D", "Y.QQKPGQAPVL[ 12,000]VVYDDSDRPSGIPERF.S",
"Y.QQKPGQAPVLVVY.D", "Y.QQKPGQAPVLVVYDDSDRPSGIPERF.S", "Y.TFTGY.Y",
"Y.TFTGYY.M", "Y.TILDKSITSL.Y", "Y.VLTQPPSVSVAPGQTARITC[ 58,005]GGNNIGSKSVHW.Y",
"Y.WGQGTL.V", "Y.YMHW.V"), class = "factor"), Greg = c(NA,
NA), Matt = structure(c(6L, 6L), .Label = c("","C[ 58]", "C[ 58], F[ 1152]", "C[ 58], F[ 1152], L[ 12], M[ 12]",
"C[ 58], L[ 2909]", "L[ 12]", "L[ 370]", "L[ 504]", "M[ 12]",
"M[ 1283]", "M[ 1457]", "M[ 1491]", "M[ 16]", "M[ 16], Y[ 1013]",
"M[ 16], Y[ 1152]", "M[ 16], Y[ 762]", "M[ 371]", "M[ 386], Y[ 12]",
"M[ 486], W[ 12]", "Y[ 12]", "Y[ 1240]", "Y[ 1502]", "Y[ 1988]",
"Y[ 2918]"), class = "factor")), row.names = 1:2, class = "data.frame")
I would like to focus on the column named Name and keep only the letters in every row.
The data frame is extremely long, and the rows contain all types of characters (numbers, dots, question marks, etc.) at the beginning of the string, in the middle, at the end, and between specific letters. I would like to keep only the letters in all of these rows.
CodePudding user response:
Using gsub:
# Keep only ASCII letters in Name: every character that is not A-Z or a-z
# is replaced with the empty string. The character class must not be followed
# by a space; "[^A-Za-z] " would only match a non-letter that is immediately
# followed by a space and would leave dots, digits, commas and "]" in place.
# gsub() coerces a factor input via as.character(), so Name becomes a
# character column after this assignment.
exp_data$Name <- gsub("[^A-Za-z]", "", exp_data$Name)
CodePudding user response:
# Build a letters-only copy of Name: delete every character that is not an
# ASCII letter, then attach the result as a new column so the original Name
# values stay untouched.
letters_only <- gsub("[^a-zA-Z]", "", exp_data$Name)
exp_data$clean_name <- letters_only
exp_data
# Name Greg Matt clean_name
# 1 Y.L[ 12,000]STISKDLITY.M NA L[ 12] YLSTISKDLITYM
# 2 Y.L[ 12,000]STISKDLITY.M NA L[ 12] YLSTISKDLITYM