Home > database >  Clean a dataset that includes some character in each row
Clean a dataset that includes some character in each row

Time:01-19

I have a dataset like this:

structure(list(`Frequency
Percent` = c("car", "window", "ball", 
"ups"), AI = c("2\n0.00", "3\n0.00", "1\n0.00", "2\n0.00"), BLK = c("0\n0.00", 
"218\n0.29", "48\n0.06", "0\n0.00"), HIANIC = c("1\n0.00", "8\n0.01", 
"4\n0.01", "0\n0.00"), NATRICAN = c("9\n0.01", "7\n0.01", "8\n0.01", 
"0\n0.00"), UNK = c("15\n0.02", "83\n0.11", "36\n0.05", "0\n0.00"
), yy = c("111\n0.15", "897\n1.20", "756\n1.02", "1\n0.00")), class = "data.frame", row.names = c(NA, 
-4L))

How can I split each row by "" and remove n to make two new columns. For instance, car and AI cell (2\n0.00), I will have 2 and 0.00 in two different columns.

CodePudding user response:

Using tidyr::separate_rows and tidyr::pivot_wider you could do:

library(tidyr)
library(dplyr)

dat |> 
  mutate(unit = c("n\npct")) |> 
  separate_rows(-1, sep = "\n") |> 
  pivot_wider(names_from = "unit", values_from = -1)
#> # A tibble: 4 × 15
#>   Frequency\n…¹ AI_n  AI_pct BLK_n BLK_pct HIANI…² HIANI…³ NATRI…⁴ NATRI…⁵ UNK_n
#>   <chr>         <chr> <chr>  <chr> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>
#> 1 car           2     0.00   0     0.00    1       0.00    9       0.01    15   
#> 2 window        3     0.00   218   0.29    8       0.01    7       0.01    83   
#> 3 ball          1     0.00   48    0.06    4       0.01    8       0.01    36   
#> 4 ups           2     0.00   0     0.00    0       0.00    0       0.00    0    
#> # … with 5 more variables: UNK_pct <chr>, yy_n <chr>, yy_pct <chr>,
#> #   unit_n <chr>, unit_pct <chr>, and abbreviated variable names
#> #   ¹​`Frequency\n                Percent`, ²​HIANIC_n, ³​HIANIC_pct, ⁴​NATRICAN_n,
#> #   ⁵​NATRICAN_pct

CodePudding user response:

One way is to use tidyr::separate in a for loop:

for(i in names(df[,-1])){
  df <- tidyr::separate(df, i, sep = "\n", into = c(i, paste0(i,"_val")))
}

Output:

# Frequency\n Percent AI  AI_val BLK  BLK_val   HIANIC  HIANIC_val  NATRICAN  NATRICAN_val UNK UNK_val  yy yy_val
# 1           car     2   0.00   0    0.00      1       0.00        9         0.01         15    0.02   111   0.15
# 2           window  3   0.00   218  0.29      8       0.01        7         0.01         83    0.11   897   1.20
# 3           ball    1   0.00   48   0.06      4       0.01        8         0.01         36    0.05   756   1.02
# 4           ups     2   0.00   0    0.00      0       0.00        0         0.00         0     0.00   1     0.00

CodePudding user response:

There is also a base R solution:

dat = structure(list(`Frequency
                       Percent` = c("car", "window", "ball", 
                                    "ups"), AI = c("2\n0.00", "3\n0.00", "1\n0.00", "2\n0.00"), BLK = c("0\n0.00", 
                                                                                                        "218\n0.29", "48\n0.06", "0\n0.00"), HIANIC = c("1\n0.00", "8\n0.01", 
                                                                                                                                                        "4\n0.01", "0\n0.00"), NATRICAN = c("9\n0.01", "7\n0.01", "8\n0.01", 
                                                                                                                                                                                            "0\n0.00"), UNK = c("15\n0.02", "83\n0.11", "36\n0.05", "0\n0.00"
                                                                                                                                                                                            ), yy = c("111\n0.15", "897\n1.20", "756\n1.02", "1\n0.00")), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                                                                                                              -4L))
    transformed = data.frame(Freq_pc = dat[,1])
for(col in seq(2, ncol(dat))){
    transformed = cbind(transformed, t(matrix(unlist(strsplit(dat[,col], "\n")), nrow=2)))
    names(transformed)[c(2*(col-1), 2*(col-1) 1)] = c(paste0(names(dat)[col], "_n"), paste0(names(dat)[col], "_pc"))
}

That results in:

 Freq_pc AI_n AI_pc BLK_n BLK_pc HIANIC_n HIANIC_pc NATRICAN_n NATRICAN_pc UNK_n UNK_pc yy_n yy_pc
1     car    2  0.00     0   0.00        1      0.00          9        0.01    15   0.02  111  0.15
2  window    3  0.00   218   0.29        8      0.01          7        0.01    83   0.11  897  1.20
3    ball    1  0.00    48   0.06        4      0.01          8        0.01    36   0.05  756  1.02
4     ups    2  0.00     0   0.00        0      0.00          0        0.00     0   0.00    1  0.00
  • Related