I have a table that's stored in a single column of a data frame. I want to convert that single column into a data frame with the original column names.
> head(Table)
# A tibble: 6 x 1
table
<chr>
1 "Characteristic Urban Rural Total"
2 "_________________________________________________"
3 "Electricity"
4 " Yes 99.8 94.4 98.9"
5 " No 0.2 5.6 1.1"
6 "Total 100.0 100.0 100.0"
I want to convert this into a data frame with the same columns, Characteristic, Urban, Rural, Total.
The only time the integers occur are in the last 3 columns. I'm not sure if I can use that to separate the columns?
> dput(Table)
structure(list(table = c("Characteristic Urban Rural Total",
"_________________________________________________", "Electricity",
" Yes 99.8 94.4 98.9", " No 0.2 5.6 1.1",
"Total 100.0 100.0 100.0", "Source of drinking water",
" Piped into residence 97.1 81.4 94.4", " Public tap 0.0 0.3 0.1",
" Well in residence 1.1 3.7 1.6", " Public well 0.0 0.4 0.1",
" Spring 0.0 2.3 0.4", " Rainwater 0.3 0.6 0.3",
" Tanker truck 1.0 10.6 2.7", " Bottled water 0.3 0.1 0.3",
" Other 0.1 0.5 0.2", "Total 100.0 100.0 100.0",
"Time to water source", " (in minutes)", " <15 minutes 99.9 97.5 99.5",
"Type of salt used", " for cooking", " Salt not used 0.1 0.1 0.1",
" Packaged salt, iodized 95.8 91.3 95.0", " Packaged salt, noniodized 3.8 8.4 4.6",
" Other 0.2 0.2 0.2", "Total 100.0 100.0 100.0",
"Sanitation facility", " Own flush toilet 94.0 70.8 90.1",
" Shared flush toilet 1.6 2.4 1.8", " Traditional pit toilet 4.2 24.0 7.5",
" No facility 0.1 2.8 0.6", "Total 100.0 100.0 100.0",
"Type of sewage system", " Public network 70.5 4.4 59.2",
" Dug hole 29.3 92.4 40.1", " No sewage 0.1 3.1 0.6",
" Missing 0.1 0.0 0.1", "Total 100.0 100.0 100.0",
"Main floor material", " Earth/sand 0.1 1.4 0.3",
" Wood planks 0.0 0.1 0.0", " Vinyl/asphalt strips 0.6 0.3 0.5",
" Ceramic tiles 86.2 61.8 82.1", " Cement 13.1 36.3 17.0",
"Total 100.0 100.0 100.0", "Persons per sleeping room",
" 1-2 50.4 40.6 48.7", " 3-4 39.1 41.8 39.5",
" 5-6 8.4 13.0 9.2", " 7 2.1 4.7 2.5",
" Missing/Don't know 0.1 0.0 0.1", "Total 100.0 100.0 100.0",
"Mean 3.0 3.4 3.0", "Number of households 6,086 1,249 7,335",
"", "", "", "", " 14", "")), row.names = c(NA,
-61L), class = c("tbl_df", "tbl", "data.frame"))
CodePudding user response:
Using strsplit
.
tmp <- lapply(strsplit(dat$V2, '\\s ')[c(1, 4:6)], \(x) x[x != ''])
(r <- setNames(do.call(rbind.data.frame, tmp[-1]), tmp[[1]]) |>
type.convert(as.is=T))
# Characteristic Urban Rural Total
# 1 Yes 99.8 94.4 98.9
# 2 No 0.2 5.6 1.1
# 3 Total 100.0 100.0 100.0
Note: R >= 4.1 used.
Data:
dat <- structure(list(V2 = c("Characteristic Urban Rural Total",
"_________________________________________________", "Electricity",
" Yes 99.8 94.4 98.9", " No 0.2 5.6 1.1",
"Total 100.0 100.0 100.0")), row.names = c(NA,
-6L), class = "data.frame")
CodePudding user response:
A possible solution, based on tidyverse
:
library(tidyverse)
df %>%
filter(table != "_________________________________________________" ) %>%
mutate(table = str_trim(table)) %>%
separate(table, sep = "\\s (?=\\d )",
into = c("Characteristic", "Urban", "Rural", "Total"), fill = "right") %>%
filter(Characteristic != "") %>%
slice(-1)
#> # A tibble: 54 × 4
#> Characteristic Urban Rural Total
#> <chr> <chr> <chr> <chr>
#> 1 Electricity <NA> <NA> <NA>
#> 2 Yes 99.8 94.4 98.9
#> 3 No 0.2 5.6 1.1
#> 4 Total 100.0 100.0 100.0
#> 5 Source of drinking water <NA> <NA> <NA>
#> 6 Piped into residence 97.1 81.4 94.4
#> 7 Public tap 0.0 0.3 0.1
#> 8 Well in residence 1.1 3.7 1.6
#> 9 Public well 0.0 0.4 0.1
#> 10 Spring 0.0 2.3 0.4
#> # … with 44 more rows