remove part of string in few rows r-CodePudding

I have a data frame like this:

# Example data: one date string (dd/mm/yyyy) and one id string per row.
# Some `Temp_id` values carry extra text followed by a comma.
dummy_data <- data.frame(
  Date = c(
    "24/06/2002", "24/06/2002", "01/07/2002", "01/07/2002",
    "08/07/2002", "08/07/2002", "15/07/2002", "17/07/2002",
    "22/07/2002", "22/07/2002", "29/07/2002"
  ),
  Temp_id = c(
    "ABC", "M567", "M567", "M567", "XYZ", "XYZ",
    "T300/500,XYZ", "T300/390,XYZ", "0000,M300", "1234,M678", "ABC"
  ),
  stringsAsFactors = FALSE
)

In some of the rows of the column "Temp_id" there is additional text.

How can I remove the part before ',' and leave the rest of the string in the column?

# Required output: the same rows, with the text before the comma (and the
# comma itself) removed from `Temp_id`.
dummy_data <- structure(
  list(
    Date = c(
      "24/06/2002", "24/06/2002", "01/07/2002", "01/07/2002",
      "08/07/2002", "08/07/2002", "15/07/2002", "17/07/2002",
      "22/07/2002", "22/07/2002", "29/07/2002"
    ),
    Temp_id = c(
      "ABC", "M567", "M567", "M567", "XYZ", "XYZ",
      "XYZ", "XYZ", "M300", "M678", "ABC"
    )
  ),
  class = "data.frame",
  row.names = c(NA, -11L)
)

CodePudding user response：

This is your column Temp_id:

# The `Temp_id` column as a stand-alone character vector.
Temp_id <- c(
  "ABC", "M567", "M567", "M567", "XYZ", "XYZ",
  "T300/500,XYZ", "T300/390,XYZ", "0000,M300", "1234,M678", "ABC"
)

Which prints as:

 [1] "ABC"          "M567"         "M567"         "M567"         "XYZ"          "XYZ"          "T300/500,XYZ"
 [8] "T300/390,XYZ" "0000,M300"    "1234,M678"    "ABC"

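An easy way is to use the gsub function, which replaces every match of the regex pattern you indicate with another expression. In this case we are indicating that everything from the beginning of the string up to and including the last comma - ^.*, - is replaced with nothing - '' . (Because .* is greedy it runs to the last comma; when a value contains only one comma, as here, that is also the first comma.)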

# Strip the leading text through the comma; values without a comma are
# returned unchanged. `.*` is greedy, so it runs to the last comma.
gsub(pattern = "^.*,", replacement = "", x = Temp_id)

[1] "ABC"  "M567" "M567" "M567" "XYZ"  "XYZ"  "XYZ"  "XYZ"  "M300" "M678" "ABC"

In case you don't understand the regex symbols:

^ -> beginning of the string, . -> any character, * -> repeat the previous ' . ' zero or more times, as many times as possible (greedy), , -> a literal comma, where the match ends

Applying to the dataframe:

# Overwrite the column in place with the cleaned values.
dummy_data[["Temp_id"]] <- gsub(
  pattern = "^.*,",
  replacement = "",
  x = dummy_data[["Temp_id"]]
)

> dummy_data
         Date Temp_id
1  24/06/2002     ABC
2  24/06/2002    M567
3  01/07/2002    M567
4  01/07/2002    M567
5  08/07/2002     XYZ
6  08/07/2002     XYZ
7  15/07/2002     XYZ
8  17/07/2002     XYZ
9  22/07/2002    M300
10 22/07/2002    M678
11 29/07/2002     ABC

CodePudding user response：

With dplyr and stringr...

library(dplyr)
library(stringr)


# Where `Temp_id` contains a comma, keep only what follows it (look-behind
# for ","); every other row falls through to the `TRUE` branch unchanged.
dummy_data |>
  mutate(
    Temp_id = case_when(
      str_detect(Temp_id, ",") ~ str_extract(Temp_id, "(?<=,).*$"),
      TRUE ~ Temp_id
    )
  )
# Or using `ifelse()`: the same comma test picks between the extracted
# suffix and the original value.
dummy_data |>
  mutate(
    Temp_id = ifelse(
      test = str_detect(Temp_id, ","),
      yes = str_extract(Temp_id, "(?<=,).*$"),
      no = Temp_id
    )
  )

#>          Date Temp_id
#> 1  24/06/2002     ABC
#> 2  24/06/2002    M567
#> 3  01/07/2002    M567
#> 4  01/07/2002    M567
#> 5  08/07/2002     XYZ
#> 6  08/07/2002     XYZ
#> 7  15/07/2002     XYZ
#> 8  17/07/2002     XYZ
#> 9  22/07/2002    M300
#> 10 22/07/2002    M678
#> 11 29/07/2002     ABC

Created on 2022-10-13 with reprex v2.0.2