How can I split the following string using R?-CodePudding

I want to split the following character string from a chess game into separate strings like the ones below removing the "1-9." pattern while maintaining all the other text.

Example:

# Chess move list: numbered move pairs in algebraic notation; a trailing "+"
# on a move marks a check and must survive the split.
text <- "1. e4 e5 2. Nf3 Nf6 3. Nxe5 d6 4. Nd3 Nxe4 5. Qe2 Qe7 6. Nf4 Nf6 7. d4 Nc6 8. c3 d5 9. Nd2 Nd8 10. Nf3 Qxe2+ 11. Bxe2 Bd6 12. O-O O-O 13. Bd3 Re8 14. Re1 Rxe1+ 15. Nxe1 Ne6 16. Nxe6 Bxe6 17. g3 g6 18. Ng2 Re8 19. f3 Nh5 20. Kf2 c6 21. g4 Ng7 22. Bf4 Bxf4 23. Nxf4 g5 24. Ne2 f5 25. h3 Kf7 26. Rh1 h6 27. f4 fxg4 28. hxg4 Bxg4 29. Rxh6 Bf5 30. Bxf5 Nxf5 31. Rh7+ Ng7 32. fxg5 Kg6 33. Rh3 Kxg5 34. Rg3+ Kf6 35. Rf3+ Ke7 36. Nf4 Kd6 37. Ng6 Re6 38. Ne5 Ne8 39. Rf7 Rf6+ 40. Rxf6+ Nxf6 41. Ke3"

Desired result:

text_outcome <- c("e4", "e5", "Nf3", "Nf6", ..., "O-O", "Rg3+", ...)

So far I've tried:

# Asker's attempt (kept verbatim; it yields the unwanted output shown below).
# Split on every character that is NOT 1-9, ".", or a letter. Note that "0",
# "-" and spaces are therefore split points, so "10." breaks into "1" and "."
# and "O-O" breaks into two separate "O" tokens.
# NOTE(review): mutate/case_when/filter and str_replace_all are presumably from
# dplyr and stringr; neither package is loaded in this snippet - confirm.
text_outcome <- strsplit(text, "[^1-9.a-zA-Z]")[[1]] |> 
  as.data.frame() |> 
  # Turn each lone "O" back into "O-O"; every "O" token is rewritten, which is
  # where the duplicated "O-O" entries in the output come from.
  # NOTE(review): `text_outcome` is referenced here while it is still being
  # assigned - presumably it resolves to a value left over from an earlier run.
  mutate(text_cleaned = case_when(text_outcome == "O" ~ "O-O",
                       TRUE ~ text_outcome),
         # "^[1-9]." = one leading digit 1-9 plus ANY one character (the dot is
         # unescaped); that two-character prefix is replaced by a single space.
         text_cleaned2 = str_replace_all(text_cleaned, "^[1-9].", " ")) |> 
  # Keep only rows whose cleaned value is not the empty string.
  filter(!text_cleaned2 == "")

obtaining:

  [5] "Nf3"  "Nf6"  " "    "Nxe5"
  [9] "d6"   " "    "Nd3"  "Nxe4"
 [13] " "    "Qe2"  "Qe7"  " "   
 [17] "Nf4"  "Nf6"  " "    "d4"  
 [21] "Nc6"  " "    "c3"   "d5"  
 [25] " "    "Nd2"  "Nd8"  "1"   
 [29] "."    "Nf3"  "Qxe2" " ."  
 [33] "Bxe2" "Bd6"  " ."   "O-O" 
 [37] "O-O"  "O-O"  "O-O"  " ."  
 [41] "Bd3"  "Re8"  " ."   "Re1" 
 [45] "Rxe1" " ."   "Nxe1" "Ne6" 
 [49] " ."   "Nxe6" "Bxe6" " ."  
 [53] "g3"   "g6"   " ."   "Ng2" 
 [57] "Re8"  " ."   "f3"   "Nh5" 
 [61] "2"    "."    "Kf2"  "c6"  
 [65] " ."   "g4"   "Ng7"  " ."  
 [69] "Bf4"  "Bxf4" " ."   "Nxf4"
 [73] "g5"   " ."   "Ne2"  "f5"  
 [77] " ."   "h3"   "Kf7"  " ."  
 [81] "Rh1"  "h6"   " ."   "f4"  
 [85] "fxg4" " ."   "hxg4" "Bxg4"
 [89] " ."   "Rxh6" "Bf5"  "3"   
 [93] "."    "Bxf5" "Nxf5" " ."  
 [97] "Rh7"  "Ng7"  " ."   "fxg5"
[101] "Kg6"  " ."   "Rh3"  "Kxg5"
[105] " ."   "Rg3"  "Kf6"  " ."  
[109] "Rf3"  "Ke7"  " ."   "Nf4" 
[113] "Kd6"  " ."   "Ng6"  "Re6" 
[117] " ."   "Ne5"  "Ne8"  " ."  
[121] "Rf7"  "Rf6"  "4"    "."   
[125] "Rxf6" "Nxf6" " ."   "Ke3"

I'm sure there's a cleaner way to obtain this while removing the "O-O" duplicates and maintaining the "+" symbols, as in "Rf3+".


Thanks in advance,

CodePudding user response：

library(stringr)
# Split on single spaces, then strip the move-number tokens ("1.", "10.", ...)
# from each piece: one or more digits followed by a literal dot.
x <- str_remove_all(unlist(str_split(text, ' ')), '\\d+\\.')
# A move-number token is reduced to "" by the removal; drop those empty pieces.
x[x != ""]

 [1] "e4"    "e5"    "Nf3"   "Nf6"   "Nxe5"  "d6"    "Nd3"  
 [8] "Nxe4"  "Qe2"   "Qe7"   "Nf4"   "Nf6"   "d4"    "Nc6"  
[15] "c3"    "d5"    "Nd2"   "Nd8"   "Nf3"   "Qxe2+" "Bxe2" 
[22] "Bd6"   "O-O"   "O-O"   "Bd3"   "Re8"   "Re1"   "Rxe1+"
[29] "Nxe1"  "Ne6"   "Nxe6"  "Bxe6"  "g3"    "g6"    "Ng2"  
[36] "Re8"   "f3"    "Nh5"   "Kf2"   "c6"    "g4"    "Ng7"  
[43] "Bf4"   "Bxf4"  "Nxf4"  "g5"    "Ne2"   "f5"    "h3"   
[50] "Kf7"   "Rh1"   "h6"    "f4"    "fxg4"  "hxg4"  "Bxg4" 
[57] "Rxh6"  "Bf5"   "Bxf5"  "Nxf5"  "Rh7+"  "Ng7"   "fxg5" 
[64] "Kg6"   "Rh3"   "Kxg5"  "Rg3+"  "Kf6"   "Rf3+"  "Ke7"  
[71] "Nf4"   "Kd6"   "Ng6"   "Re6"   "Ne5"   "Ne8"   "Rf7"  
[78] "Rf6+"  "Rxf6+" "Nxf6"  "Ke3"

CodePudding user response：

You can do:

library(stringr)
# Remove every move number (digits, a dot and one trailing space) from the
# whole string first, then split what is left on single spaces.
text |>
  str_remove_all('\\d{1,}\\. ') |>
  str_split(' ') |>
  unlist()

 [1] "e4"    "e5"    "Nf3"   "Nf6"   "Nxe5"  "d6"    "Nd3"   "Nxe4"  "Qe2"  
[10] "Qe7"   "Nf4"   "Nf6"   "d4"    "Nc6"   "c3"    "d5"    "Nd2"   "Nd8"  
[19] "Nf3"   "Qxe2+" "Bxe2"  "Bd6"   "O-O"   "O-O"   "Bd3"   "Re8"   "Re1"  
[28] "Rxe1+" "Nxe1"  "Ne6"   "Nxe6"  "Bxe6"  "g3"    "g6"    "Ng2"   "Re8"  
[37] "f3"    "Nh5"   "Kf2"   "c6"    "g4"    "Ng7"   "Bf4"   "Bxf4"  "Nxf4" 
[46] "g5"    "Ne2"   "f5"    "h3"    "Kf7"   "Rh1"   "h6"    "f4"    "fxg4" 
[55] "hxg4"  "Bxg4"  "Rxh6"  "Bf5"   "Bxf5"  "Nxf5"  "Rh7+"  "Ng7"   "fxg5" 
[64] "Kg6"   "Rh3"   "Kxg5"  "Rg3+"  "Kf6"   "Rf3+"  "Ke7"   "Nf4"   "Kd6"  
[73] "Ng6"   "Re6"   "Ne5"   "Ne8"   "Rf7"   "Rf6+"  "Rxf6+" "Nxf6"  "Ke3"

CodePudding user response：

In base R:

# Delete every move number ("1. ", "10. ", ...): one or more digits, a literal
# dot and the whitespace that follows; then split the remainder on single
# spaces and flatten the one-element list that strsplit() returns.
text |> gsub(pattern = '\\d+\\.\\s+', replacement = '') |>
 strsplit(' ') |> unlist()

[1] "e4"    "e5"    "Nf3"   "Nf6"   "Nxe5"  "d6"    "Nd3"   "Nxe4"  "Qe2"  
[10] "Qe7"   "Nf4"   "Nf6"   "d4"    "Nc6"   "c3"    "d5"    "Nd2"   "Nd8"  
[19] "Nf3"   "Qxe2+" "Bxe2"  "Bd6"   "O-O"   "O-O"   "Bd3"   "Re8"   "Re1"  
[28] "Rxe1+" "Nxe1"  "Ne6"   "Nxe6"  "Bxe6"  "g3"    "g6"    "Ng2"   "Re8"  
[37] "f3"    "Nh5"   "Kf2"   "c6"    "g4"    "Ng7"   "Bf4"   "Bxf4"  "Nxf4" 
[46] "g5"    "Ne2"   "f5"    "h3"    "Kf7"   "Rh1"   "h6"    "f4"    "fxg4" 
[55] "hxg4"  "Bxg4"  "Rxh6"  "Bf5"   "Bxf5"  "Nxf5"  "Rh7+"  "Ng7"   "fxg5" 
[64] "Kg6"   "Rh3"   "Kxg5"  "Rg3+"  "Kf6"   "Rf3+"  "Ke7"   "Nf4"   "Kd6"  
[73] "Ng6"   "Re6"   "Ne5"   "Ne8"   "Rf7"   "Rf6+"  "Rxf6+" "Nxf6"  "Ke3"

CodePudding user response：

Using str_extract

library(stringr)
# Extract the moves directly instead of deleting the numbers: each match is a
# letter followed by one or more non-space characters. Move numbers contain no
# letter, so they are never matched.
str_extract_all(text, "[A-Za-z]\\S+")[[1]]
 [1] "e4"    "e5"    "Nf3"   "Nf6"   "Nxe5"  "d6"    "Nd3"   "Nxe4"  "Qe2"   "Qe7"   "Nf4"   "Nf6"   "d4"    "Nc6"   "c3"    "d5"    "Nd2"   "Nd8"   "Nf3"   "Qxe2+"
[21] "Bxe2"  "Bd6"   "O-O"   "O-O"   "Bd3"   "Re8"   "Re1"   "Rxe1+" "Nxe1"  "Ne6"   "Nxe6"  "Bxe6"  "g3"    "g6"    "Ng2"   "Re8"   "f3"    "Nh5"   "Kf2"   "c6"   
[41] "g4"    "Ng7"   "Bf4"   "Bxf4"  "Nxf4"  "g5"    "Ne2"   "f5"    "h3"    "Kf7"   "Rh1"   "h6"    "f4"    "fxg4"  "hxg4"  "Bxg4"  "Rxh6"  "Bf5"   "Bxf5"  "Nxf5" 
[61] "Rh7+"  "Ng7"   "fxg5"  "Kg6"   "Rh3"   "Kxg5"  "Rg3+"  "Kf6"   "Rf3+"  "Ke7"   "Nf4"   "Kd6"   "Ng6"   "Re6"   "Ne5"   "Ne8"   "Rf7"   "Rf6+"  "Rxf6+" "Nxf6" 
[81] "Ke3"