Home > database >  Create columns from text in the cells of other columns in a dataframe in R
Create columns from text in the cells of other columns in a dataframe in R

Time:11-18

First, let's use this part of the dataframe I am using:

x <- structure(list(element = c("lt P4", "lt P4", "m fragment", "rt M3 (fragment)", 
"lt P4", "dP4/", "lt P3", "lt M1", "rt I2", "dP/4", "lt P4 (unerupted germ)", 
"rt m2", "rt m2", "rt m3", "rt i1", "rt m2 or 3", "rt m2 or 3", 
"rt m3 frag", "rt I1", "rt m3", NA, "rt m1 frag", "rt m2", "lt M1", 
"rt dp4", "rt M2 frag", "lt m2", "rt i2 frag", "lt P3 frag", 
"lt M1 frag", "rt m2 frag", "lt m frag", "rt dP4 frag", "rt M3", 
NA, "rt M2", "rt m1", "lt M1 frag", "lt m2 frag", "rt c", "rt P4 frag", 
"lt p3 frag", "rt m3", "rt m", "rt M1", "lt m3", "rt m3", "lt M3", 
"rt P4 frag", "lt M1  (2?)", "rt P3", "m3", "p4 (P4?)", "lt P3 frag", 
"lt m1", "rt p3", "rt m frag", "rt p4 frag", "rt dp4 frag", "lt C", 
"lt P4 frag", "rt M frag", "rt M3 frag", "rt P4 frag", "molar frag", 
"rt m frag", "rt m3 frag", "lt M3", "rt m3", "lt m3 frag", "lt p4", 
"lt p4", "rt p3 (in mand)", "rt P4 (in mand)", "rt m2 (in mand)", 
"lt P3", "rt m1", "lt p3 frag (no - rt p4)", "lt m2 frag", "lt M frag (M1)", 
"lt M2", "lt m1", "rt m3", "lt M1", "lt M2", "lt M1", "rt I1", 
"lt m frag", "rt m frag", "rt p4", "rt p4", "dP4", "lt p4 (in mand frag)", 
"lt I1", "rt dp4", "rt m2", "lt p4 (this is an unworn rt M3)", 
"lt m2", "rt m3 frag", "lt P3 frag (no - rt P3)", "lt m3", "lt p4", 
"rt p4 (could be M3)", "rt molar frag", "lt molar frag", "lt m1", 
"lt dp4", "lt dp3or4", "lt dp3 (looks like rt dm2)", "rt P4 frag", 
"lt m2", "r c (in mand)", "r p4 (in mand)", "rt m3", "lt p4 frag", 
"lt m3 (in mand)", "lt p4 (in mand)", "rt m3 (in mand)", "lt m1", 
"rt m2 frag", "lt p3 (in mand)", "lt p4 (in mand)", "rt p3 (in mand)", 
"rt m2 (in mand)", "lt M1, lt M2, lt M1 frag, lt M", "lt P4 frag, lt m frag", 
"lt M1", "rt dM1", "lt dM1", "lt dM2", "lt P4", "frag lt P3, lt M1, M2, M3, rt P3, P4,M1,M2,M3,", 
"rt P4", "rt P3", "lt M1", "rt M2", "lt M2", "rt M1", "rt M3", 
"lt M3", "rt m3 frag", "lt M3", "lt molar frag", "rt m1 or 2", 
"M3", "lt P3or4", "rt p3", "lt m3", "lt m2", "rt m1or2 frag", 
"lt M3", "rt i2", "lt M1 (how would you know? Too fragmentary)", 
"M frag", "maxilla", "maxilla frag", "lt m2", "lt dp4 (fragment)", 
"rt m (frag)", "lt m1", "tooth fragment", "lt M3 germ", "lt M3", 
"lt M1", "lt m3", "rt p3", "molar fragment (robust?)", "tooth fragments", 
"mand w/no dentition", "rt dM2", "lt p3", "rt M1", "lt p3", "lt m1", 
"lt M3 frag", "lt i1", "lt M1", "lt I1", "rt I2", "rt I2", "rt I1", 
"rt upper C", "rt m1", "lt c", "lt I1", "lt P3 or 4", "lt m1 germ", 
"lt dp3 (in mand)", "lt dp4 (in mand)", "rt m2", "lt M", "rt m3", 
"rt m3", "rt p3", "rt P3 (in max)", "rt P4 (in max)", "rt M1 (in max)", 
"rt m3 germ", "lt m2", "rt m3", "rt M3", "rt M3", "rt M frag", 
"lt m3,", "rt P3", "lt M3", "rt p4", "lt M1 frag", "rt P3", "rt P4 frag", 
"lt P4", "lt M frag", "lt p3", "rt M1", "rt p4", "molar frag", 
"rt m3", "lt p4", "mandible fragment with roots of dp4", "lt m1 germ, incompletely formed", 
"M3/", "m?", "lt m2 or 3", "molar frag", "lt m2or3 (how would you ever know?)", 
"rt M2or1", "rt M1", "rt M frag", "rt M1 (in max)", "rt P4", 
"lt p4 (in mand)", "lt m1 (in mand)", "lt m2 (in mand)", "lt m3 (in mand)", 
"M lower", "premolar", "tooth fragment", "M fragments", "rt m1 frag", 
"lt M", "rt m", "M upper", "M upper", "rt P3or4", "mandibular fragment", 
"lt m1", "rt m2 frag", "rt m3 frag", "M frag", "rt m2", "rt I2", 
"rt m1 or 2 unerupted germ", "hmd", "M upper", "rt mand p4 and symphysis", 
"lt m1", "lt M3 frag", "rt P3", "molar frag", "premolar frag", 
"molar frag", "lt p3", "molar frag", "rt m1 frag", "lt p4 frag", 
"molar frag", "rt M2", "rt p3", "lt P4", "rt p4", "rt p3", "lt m1", 
"rt m3", "rt i2", "mand (rt p4-m2, lt p3-m3), max (ltP3-M2, rtP3-M3)", 
"lt p3", "lt m2", "lt m3", "rt p3", "rt p4", "rt m1", "rt m2", 
"rt P4", "rt M3", "rt m2 frag", "lt molar frag", "lt m3 or p3 frag (I think distal aspect of lt dp4)", 
"lt m3", "tooth frag", "lt M1", "rt m2or3", "lt m2", "lt m1 germ", 
"lt m germ", "lt M1", "lt p4", "rt m1 or 2", "m frag", "lt m (1 or 2)", 
"rt M2", "lt P3 or 4 fragment", "lt p4 (distal fragment) (matches W 7-23)", 
"rt m1or2", "rt M1", NA, "lt P3or4"), `Intervalle stratigraphique...6` = c("B 2", 
"B 2", "B 2", "B 2", "B 2", "B 2", "B 2", "B 2", "B 2", "B 2", 
"B 2", "K 4", "F 1", "F 1", "F 1", "B 9", "B 9", "E 2", "B 9 - C", 
"B 9 - C", "B 9 - C", "?", "B 9 - C", "C 9 ?", "C 9 ?", "B 9 - C", 
"F 0 or F 1?", "B 9 - C", "B 9 - C", "?", "E 5 - F 0-1", "E 5 - F 0-1", 
"E 5 - F 0-1", "C 6", "F 2", "F 2", "E 2", "E 2", "B 10", "F 1", 
"F 1", "F 1", "F 1", "F 1", "F 1", "B 10", "D 1 ?", "E 3", "E 3", 
"E 3", "E 3", "E 3", "E 3", "E 3", "?", "F 0", "F 0", "F 0", 
"F 0", "F 0", "F 0", "F 0", "F 0", "F 0", "F 0", "F 0", "F 0", 
"F 0", "F 0", "F 0", "E 5 - F", "F 3", "G 4", "G 4", "G 4", "D 1", 
"C 5", "F inf", "F inf", "F inf", "D", "C 9", "C 9", "C 9", "C 9", 
"C 9", "C 9", "C 9", "C 9", "C 9", "C 9", "C 7", "C 6", "D 5", 
"C 9?", "C 4 - C 6", "G 3", "G 3", "G 3", "G 3", "G 3", "G 3", 
"G 3", "G 3", "G 3", "G 3", "D 5", "D 5", "D moy", "G 5", "G 4 - G 8", 
"G 3 - G 5", "G 3 - G 5", "B 12 - C 1", "G 5", "G 4 - G 8", "G 4 - G 8", 
"G 4 - G 8", "D 1", "C 5", "F 1", "F 1", "F 1", "F 1", "D 5", 
"D 5", "C 8", "C 8", "C 8", "C 8", NA, "G 27 - G 28", NA, NA, 
NA, NA, NA, NA, NA, NA, "D upper", "D upper", "D upper", "B 9", 
"B 9", "B 10", "F 3", "G 1", "G 1", "G 1", "G 3", "G 3", "G 3", 
"G 3", "G 3", "G 3", "C 5", "C 5", "C 5", "C 5", "C 5", "C 6", 
"E 1", "?", "F 2-F 3", "E 3 - E 5", "C 7", "C", "C 8", "C 8", 
"C 8", "C 8", "C 8", "C 8", "C 8", "C 8", "C 8", "C 8?", "C 8?", 
"C 8?", "C 8?", "G 7", "G 7", "B 2", "?", "B 10", "B 10", "G 7", 
"G 7", "C 7", "?", "B 9 - B 10", "B 10", "G 6 - G 13", "G 8", 
"G 8", "G 8", "C (C8?)", "F 0", "F 0", "F 0", "F 0", "F 0", "F 0", 
"F 0", "F 0", "F 0", "F 0", "F 0", "F 0", "F 0", "F 0", "F 0", 
"F 0", "F 0", "F 0", "F 0", "F 0", "L 9", "L 9", "K 3", "C 4", 
"C 4", "G 1", "G 1", "G 1", "J 4", "J", "K 1", "K 2", "D 4", 
"D 4", "D 4", "D 4", "F", NA, "L", "L", "L", "L", "L", NA, NA, 
"E 1", "E 1", "G 8", "G 8", "G 8", "G 8", "G 8", "?", "E 3", 
NA, NA, "E 4", "E 4", "E 4", "E 4", "E 4", "E 5", "E 5", "E 5", 
"E 5", "E 5", "E 5", "E 5", "?", "?", "C 8", "H 4 - H 6", "G 3 - G 8", 
"G 1 - G 13", "G 1 - G 13", "G 1 - G 13", "G 4 - G 13", "G 4 - G 13", 
"G 4 - G 13", "G 4 - G 13", "G 4 - G 13", "G 4 - G 13", "G 4 - G 13", 
"G 4 - G 13", "G 4 - G 13", "G 4 - G 13", "F 1 - F 3", "F 1 - F 3", 
"F 1 - F 3", "F 1 - F 3", "F 1 - F 3", "C 8", "C 8", "L 9", "G 8 - G 9", 
"H 5", "F 3", "B 2", "B 2", "B 2", "B 2", "B 2", "B 2", "B 2", 
"B 2", "B 2", NA, "B 2")), row.names = c(NA, -306L), class = c("tbl_df", 
"tbl", "data.frame"))

If we see its structure (str(x)), we see it is composed of 2 columns (variables) and 306 rows.

tibble [306 × 2] (S3: tbl_df/tbl/data.frame)
$ element                       : chr [1:306] "lt P4" "lt P4" "m fragment" "rt M3 (fragment)" ...
$ Intervalle stratigraphique...6: chr [1:306] "B 2" "B 2" "B 2" "B 2" ...

If we observe the head of the dataframe (head(x, n=15)), we see that element is composed of different text strings.

# A tibble: 15 × 2
   element                `Intervalle stratigraphique...6`
   <chr>                  <chr>                           
 1 lt P4                  B 2                             
 2 lt P4                  B 2                             
 3 m fragment             B 2                             
 4 rt M3 (fragment)       B 2                             
 5 lt P4                  B 2                             
 6 dP4/                   B 2                             
 7 lt P3                  B 2                             
 8 lt M1                  B 2                             
 9 rt I2                  B 2                             
10 dP/4                   B 2                             
11 lt P4 (unerupted germ) B 2                             
12 rt m2                  K 4                             
13 rt m2                  F 1                             
14 rt m3                  F 1                             
15 rt i1                  F 1  

You can see P4, M3, m2, i1, etc. Ultimately they refer to specific teeth (premolar, molar, etc)

So I would like to create columns based on those specific codes inside the text string:

  • First, a column with those codes, named as Tooth.
  • Second column (Position): if the code is in lower case (p3), it is a lower premolar, therefore it would be L. If it is upper case (P3) it would be upper (U).

So the final output would be:

  • For row 8: Tooth = M1; Position = U
  • For row 14: Tooth = m3; Position = L

How can I do it?

CodePudding user response:

You could use the stringr package to detect and extract that information using regular expressions. grep is an option if you want to go with base R:

x$tooth <- stringr::str_extract(x$element, "\\w\\d")
x$position <- ifelse(stringr::str_detect(x$element,"[[:upper:]]"), "U", "L")

head(x, 15)

   element                `Intervalle stratigraphique...6` tooth position
   <chr>                  <chr>                            <chr> <chr>   
 1 lt P4                  B 2                              P4    U       
 2 lt P4                  B 2                              P4    U       
 3 m fragment             B 2                              NA    L       
 4 rt M3 (fragment)       B 2                              M3    U       
 5 lt P4                  B 2                              P4    U       
 6 dP4/                   B 2                              P4    U       
 7 lt P3                  B 2                              P3    U       
 8 lt M1                  B 2                              M1    U       
 9 rt I2                  B 2                              I2    U       
10 dP/4                   B 2                              NA    U       
11 lt P4 (unerupted germ) B 2                              P4    U       
12 rt m2                  K 4                              m2    L       
13 rt m2                  F 1                              m2    L       
14 rt m3                  F 1                              m3    L       
15 rt i1                  F 1                              i1    L     
  • \\w\\d searches a combination of a letter and a number.

  • [[:upper:]] detects upper case letters

I hope your data is clean enough that this will always give you the right results (it looks like row 10 doesn't quite follow the rules?).

  • Related