Extract a string from each cell individually and create new data frames-CodePudding

I am currently running an experiment checking the odds of certain bookmakers. The problem is I have what looks like an entire data frame inserted within each cell as a string. One column is a fixture ID, and the second is all the odds from the various bookmakers for that particular fixture but I want to convert this one data frame into many showing the data for each bookmaker individually. I was thinking to make some sort of for-loop to do it all in one fell swoop with the "extract" command but due to how the string is formatted within the cells, R is not liking it.

Not all of the cells hold the same amount of data. An example of what the data within one of the cells look like is as so:

[{'bookmakerId': 22, 'updated': '2021-10-01T14:46:39.890Z', 'homePrice': 1.952, 'drawPrice': 3.2, 'awayPrice': 3.9}, {'bookmakerId': 83, 'updated': '2021-10-01T03:00:51.760Z', 'homePrice': 2.01, 'drawPrice': 3.3, 'awayPrice': 4.15}]

My current data frame looks like this:

Fixture	Data
1	Data1
2	Data2

The format I am wanting is as so:

for bookmaker ID: K, where K is an element of [1..n]

Fixture	Home	draw	away
1	a	b	c
2	d	e	f

Thanks a lot, guys.

edit: dput data

structure(list(fixtureId = c("runningball-adaptor-1510023", "runningball-adaptor-1510018", 
"runningball-adaptor-1510019", "isd-adaptor-8191632", "runningball-adaptor-1510026", 
"runningball-adaptor-1510020"), oneXTwoBookmakers = c("[{'bookmakerId': 22, 'updated': '2021-10-01T14:46:39.890Z', 'homePrice': 1.952, 'drawPrice': 3.2, 'awayPrice': 3.9}, {'bookmakerId': 83, 'updated': '2021-10-01T03:00:51.760Z', 'homePrice': 2.01, 'drawPrice': 3.3, 'awayPrice': 4.15}, {'bookmakerId': 37, 'updated': '2021-10-01T18:08:53.723Z', 'homePrice': 1.97, 'drawPrice': 3.3, 'awayPrice': 4.1}, {'bookmakerId': 17, 'updated': '2021-09-30T19:59:19.163Z', 'homePrice': 1.91, 'drawPrice': 3.0, 'awayPrice': 3.75}, {'bookmakerId': 340, 'updated': '2021-09-30T20:20:33.470Z', 'homePrice': 1.95, 'drawPrice': 3.1, 'awayPrice': 4.2}]", "[{'bookmakerId': 22, 'updated': '2021-10-02T09:09:55.190Z', 'homePrice': 2.625, 'drawPrice': 3.0, 'awayPrice': 2.75}, {'bookmakerId': 83, 'updated': '2021-10-02T09:02:45.117Z', 'homePrice': 2.74, 'drawPrice': 3.11, 'awayPrice': 2.84}, {'bookmakerId': 37, 'updated': '2021-10-02T09:21:07.150Z', 'homePrice': 2.7, 'drawPrice': 3.1, 'awayPrice': 2.8}, {'bookmakerId': 17, 'updated': '2021-10-02T09:05:07.353Z', 'homePrice': 2.55, 'drawPrice': 2.9, 'awayPrice': 2.62}, {'bookmakerId': 340, 'updated': '2021-10-02T09:47:39.697Z', 'homePrice': 2.62, 'drawPrice': 3.0, 'awayPrice': 2.8}]", 
"[{'bookmakerId': 22, 'updated': '2021-10-01T23:32:46.563Z', 'homePrice': 3.3, 'drawPrice': 3.1, 'awayPrice': 2.2}, {'bookmakerId': 83, 'updated': '2021-10-01T14:05:38.270Z', 'homePrice': 3.56, 'drawPrice': 3.02,  'awayPrice': 2.25}, {'bookmakerId': 37, 'updated': '2021-10-01T18:09:33.740Z', 'homePrice': 3.55, 'drawPrice': 3.1, 'awayPrice': 2.25}, {'bookmakerId': 17, 'updated': '2021-10-01T14:11:34.050Z', 'homePrice': 3.2, 'drawPrice': 3.0, 'awayPrice': 2.15}, {'bookmakerId': 340, 'updated': '2021-10-01T15:50:45.820Z', 'homePrice': 3.4, 'drawPrice': 3.0, 'awayPrice': 2.25}]", 
"[{'bookmakerId': 17, 'updated': '2021-10-02T13:42:23.827Z', 'homePrice': 3.1, 'drawPrice': 3.2, 'awayPrice': 2.3}]", "[{'bookmakerId': 22, 'updated': '2021-10-02T16:05:45.170Z', 'homePrice': 1.727, 'drawPrice': 3.4, 'awayPrice': 4.75}, {'bookmakerId': 83, 'updated': '2021-10-01T16:45:18.623Z', 'homePrice': 1.757, 'drawPrice': 3.49, 'awayPrice': 4.91}, {'bookmakerId': 37, 'updated': '2021-10-02T15:25:06.367Z', 'homePrice': 1.75, 'drawPrice': 3.55, 'awayPrice': 5.0}, {'bookmakerId': 17, 'updated': '2021-10-01T17:31:02.897Z', 'homePrice': 1.7, 'drawPrice': 3.25, 'awayPrice': 4.4}, {'bookmakerId': 340, 'updated': '2021-10-01T19:57:13.193Z', 'homePrice': 1.77, 'drawPrice': 3.4, 'awayPrice': 4.75}]", "[{'bookmakerId': 385, 'updated': '2021-10-02T18:55:06.670Z', 'homePrice': 2.59, 'drawPrice': 2.98, 'awayPrice': 2.64}, {'bookmakerId': 22, 'updated': '2021-10-02T17:50:13.473Z', 'homePrice': 2.6, 'drawPrice': 3.0, 'awayPrice': 2.75}, {'bookmakerId': 37, 'updated': '2021-10-02T15:25:06.477Z', 'homePrice': 2.6, 'drawPrice': 3.1, 'awayPrice': 2.9}, {'bookmakerId': 17, 'updated': '2021-10-01T19:28:28.587Z', 'homePrice': 2.45, 'drawPrice': 2.9, 'awayPrice': 2.7}, {'bookmakerId': 327, 'updated': '2021-10-02T18:49:56.213Z', 'homePrice': 2.47, 'drawPrice': 3.05, 'awayPrice': 2.57}, {'bookmakerId': 83, 'updated': '2021-10-01T19:26:07.253Z', 'homePrice': 2.65, 'drawPrice': 3.02, 'awayPrice': 2.89}, {'bookmakerId': 42, 'updated': '2021-10-02T17:49:44.437Z', 'homePrice': 2.6, 'drawPrice': 3.1, 'awayPrice': 2.8}, {'bookmakerId': 340, 'updated': '2021-10-01T23:40:53.500Z', 'homePrice': 2.62, 'drawPrice': 3.0, 'awayPrice': 2.8}, {'bookmakerId': 285, 'updated': '2021-10-02T18:55:03.520Z', 'homePrice': 2.59, 'drawPrice': 2.98, 'awayPrice': 2.64}]"
)), row.names = c(NA, 6L), class = c("tbl_df", "tbl", "data.frame"
))

CodePudding user response：

Here is one method to evaluate the string (py_eval) and then unnest the list column

library(dplyr)
library(reticulate)
library(tidyr)
library(purrr)
df %>% 
  mutate(oneXTwoBookmakers = map(oneXTwoBookmakers, ~ py_eval(.x) %>% 
        bind_rows))  %>% 
  unnest(oneXTwoBookmakers)

-output

# A tibble: 30 × 6
   fixtureId                   bookmakerId updated                  homePrice drawPrice awayPrice
   <chr>                             <int> <chr>                        <dbl>     <dbl>     <dbl>
 1 runningball-adaptor-1510023          22 2021-10-01T14:46:39.890Z      1.95      3.2       3.9 
 2 runningball-adaptor-1510023          83 2021-10-01T03:00:51.760Z      2.01      3.3       4.15
 3 runningball-adaptor-1510023          37 2021-10-01T18:08:53.723Z      1.97      3.3       4.1 
 4 runningball-adaptor-1510023          17 2021-09-30T19:59:19.163Z      1.91      3         3.75
 5 runningball-adaptor-1510023         340 2021-09-30T20:20:33.470Z      1.95      3.1       4.2 
 6 runningball-adaptor-1510018          22 2021-10-02T09:09:55.190Z      2.62      3         2.75
 7 runningball-adaptor-1510018          83 2021-10-02T09:02:45.117Z      2.74      3.11      2.84
 8 runningball-adaptor-1510018          37 2021-10-02T09:21:07.150Z      2.7       3.1       2.8 
 9 runningball-adaptor-1510018          17 2021-10-02T09:05:07.353Z      2.55      2.9       2.62
10 runningball-adaptor-1510018         340 2021-10-02T09:47:39.697Z      2.62      3         2.8 
# … with 20 more rows

Or another option is jsonlite

library(jsonlite)
df %>% 
  mutate(oneXTwoBookmakers = map(oneXTwoBookmakers, 
     ~ fromJSON(chartr("'", '"', .x) ))) %>% 
   unnest(oneXTwoBookmakers)

-output

# A tibble: 30 × 6
   fixtureId                   bookmakerId updated                  homePrice drawPrice awayPrice
   <chr>                             <int> <chr>                        <dbl>     <dbl>     <dbl>
 1 runningball-adaptor-1510023          22 2021-10-01T14:46:39.890Z      1.95      3.2       3.9 
 2 runningball-adaptor-1510023          83 2021-10-01T03:00:51.760Z      2.01      3.3       4.15
 3 runningball-adaptor-1510023          37 2021-10-01T18:08:53.723Z      1.97      3.3       4.1 
 4 runningball-adaptor-1510023          17 2021-09-30T19:59:19.163Z      1.91      3         3.75
 5 runningball-adaptor-1510023         340 2021-09-30T20:20:33.470Z      1.95      3.1       4.2 
 6 runningball-adaptor-1510018          22 2021-10-02T09:09:55.190Z      2.62      3         2.75
 7 runningball-adaptor-1510018          83 2021-10-02T09:02:45.117Z      2.74      3.11      2.84
 8 runningball-adaptor-1510018          37 2021-10-02T09:21:07.150Z      2.7       3.1       2.8 
 9 runningball-adaptor-1510018          17 2021-10-02T09:05:07.353Z      2.55      2.9       2.62
10 runningball-adaptor-1510018         340 2021-10-02T09:47:39.697Z      2.62      3         2.8 
# … with 20 more rows