Bring excel-table in tidy format-CodePudding

I have some struggles converting the following data (from an Excel-sheet) into a tidy format:

input <- structure(list(...11 = c(
  NA, NA, "<1000", ">=1000 and <2000",
  "2000", ">2000 and < 3000", ">=3000"
), ...13 = c(
  "male", "female",
  NA, NA, NA, NA, NA
), ...14 = c(
  "<777", "<555", "0.3", "0.1",
  "0.15", "0.13", "0.15"
), ...15 = c(
  "888-999", "555-999", "0.23",
  "0.21", "0", "0.21", "0.36"
), ...16 = c(
  "556-899", "1020-1170",
  "0.13", "0.29", "0.7", "0.8", "0.2"
), ...17 = c(
  ">960", ">11000",
  "0.58", "0.31", "0.22", "0.65", "0.7"
)), row.names = c(NA, -7L), class = c("tbl_df", "tbl", "data.frame"))

# A tibble: 7 × 6
  ...11            ...13  ...14 ...15   ...16     ...17 
  <chr>            <chr>  <chr> <chr>   <chr>     <chr> 
1 NA               male   <777  888-999 556-899   >960  
2 NA               female <555  555-999 1020-1170 >11000
3 <1000            NA     0.3   0.23    0.13      0.58  
4 >=1000 and <2000 NA     0.1   0.21    0.29      0.31  
5 2000             NA     0.15  0       0.7       0.22  
6 >2000 and < 3000 NA     0.13  0.21    0.8       0.65  
7 >=3000           NA     0.15  0.36    0.2       0.7

I would like to bring it into the following structure:

   output <- tibble::tribble(
  ~gender, ~x, ~y, ~share,
  "male", "<777", "<1000", 0.3,
  "female", "<555", "<1000", 0.3,
  "male", "<777", ">=1000 and <2000", 0.1,
  "female", "<555", ">=1000 and <2000", 0.1,
)

    # A tibble: 4 × 4
  gender x     y                share
  <chr>  <chr> <chr>            <dbl>
1 male   <777  <1000              0.3
2 female <555  <1000              0.3
3 male   <777  >=1000 and <2000   0.1
4 female <555  >=1000 and <2000   0.1

Any hints are much appreciated!

CodePudding user response：

It's a bit unclear, but I think you'd need to do something like this

df <- input[3:nrow(input),]
input <- input[1:2, 2:3]
t <- input[rep(1:nrow(input), nrow(df)),]
s <- df[rep(1:nrow(df), 2), ]
t <- cbind(t,s)

, and repeat as needed if you need this for multiple columns.

CodePudding user response：

As outlined in the comments, here's a suggested approach:

Import the excel sheet twice using readxl's read_excel using the skip argument:

library(readxl)

df1 <- read_excel(file = "yourfile.xlsx", skip = 2)
df2 <- read_excel(file = "yourfile.xlsx", skip = 1)

That should give you (note X1 might be called ...1):

df1 <- read_table("NA               male   <777  888-999 556-899   >960
<1000            NA     0.3   0.23    0.13      0.58  
>=1000and<2000 NA     0.1   0.21    0.29      0.31  
2000             NA     0.15  0       0.7       0.22  
>2000and<3000 NA     0.13  0.21    0.8       0.65  
>=3000           NA     0.15  0.36    0.2       0.7") 

df2 <- read_table("NA               female <555  555-999 1020-1170 >11000
<1000            NA     0.3   0.23    0.13      0.58  
>=1000and<2000 NA     0.1   0.21    0.29      0.31  
2000             NA     0.15  0       0.7       0.22  
>2000and<3000 NA     0.13  0.21    0.8       0.65  
>=3000           NA     0.15  0.36    0.2       0.7")

Then do a little wrangling; most importantly turn into a long format:

library(dplyr)
library(tidyr)

df1 <- df1 |>
  select(-male) |>
  rename(y = X1) |>
  mutate(gender = "male") |>
  pivot_longer(-c("gender", "y"), names_to = "x", values_to = "share")

df2 <- df2 |>
  select(-female) |>
  rename(y = X1) |>
  mutate(gender = "female") |>
  pivot_longer(-c("gender", "y"), names_to = "x", values_to = "share")

And voila, a tidy frame:

bind_rows(df1, df2) |> arrange(y)

Output:

# A tibble: 40 × 4
   y              gender x         share
   <chr>          <chr>  <chr>     <dbl>
 1 <1000          male   <777       0.3 
 2 <1000          male   888-999    0.23
 3 <1000          male   556-899    0.13
 4 <1000          male   >960       0.58
 5 <1000          female <555       0.3 
 6 <1000          female 555-999    0.23
 7 <1000          female 1020-1170  0.13
 8 <1000          female >11000     0.58
 9 >=1000and<2000 male   <777       0.1 
10 >=1000and<2000 male   888-999    0.21
# … with 30 more rows