I have election data that looks mostly like this:
  t1        t2          t3          t4
  <chr>     <chr>       <chr>       <chr>
1 CONTEST 1 NA          NA          NA
2 Precinct  Candidate 1 Candidate 2 Candidate 3
3 PE01      1           4           2
4 PE02      5           3           2
5 CONTEST 2 NA          NA          NA
6 Precinct  Candidate 1 Candidate 2 Candidate 3
7 PE01      2           1           5
8 PE02      4           2           3
In order to pivot the data, I’ve created a new variable contest where I move over the contest name and fill down, group_by contest, and group_split. Then I map_dfr a function to get rid of the first row, move the 2nd (now 1st) row to variable names (super easy with janitor's row_to_names), and pivot_longer.
library(dplyr)
library(tidyr)
library(stringr)
library(janitor)
library(purrr)
t_lst <- t %>%
mutate(contest = if_else(str_detect(t1, " "), t1, NA_character_), .before = t1) %>%
fill(contest, .direction = "down") %>%
group_by(contest) %>%
group_split()
e_pivot <- function(x) {
x %>%
slice(-1) %>%
row_to_names(row_number = 1) %>%
pivot_longer(cols = 3:last_col(), names_to = "candidate", values_to = "votes") %>%
rename(contest = 1, precinct = Precinct) %>%
mutate(votes = as.numeric(votes))
}
map_dfr(t_lst, e_pivot)
Works great, I get this and can now actually use the data.
   contest   precinct candidate   votes
   <chr>     <chr>    <chr>       <dbl>
 1 CONTEST 1 PE01     Candidate 1     1
 2 CONTEST 1 PE01     Candidate 2     4
 3 CONTEST 1 PE01     Candidate 3     2
 4 CONTEST 1 PE02     Candidate 1     5
 5 CONTEST 1 PE02     Candidate 2     3
 6 CONTEST 1 PE02     Candidate 3     2
 7 CONTEST 2 PE01     Candidate 1     2
 8 CONTEST 2 PE01     Candidate 2     1
 9 CONTEST 2 PE01     Candidate 3     5
10 CONTEST 2 PE02     Candidate 1     4
11 CONTEST 2 PE02     Candidate 2     2
12 CONTEST 2 PE02     Candidate 3     3
BUT I’ve realized that some contests have so many candidates that the contest block repeats, so the data looks like this:
  t1        t2          t3          t4
  <chr>     <chr>       <chr>       <chr>
1 CONTEST 1 NA          NA          NA
2 Precinct  Candidate 1 Candidate 2 Candidate 3
3 PE01      1           4           2
4 PE02      5           3           2
5 CONTEST 1 NA          NA          NA
6 Precinct  Candidate 4 Candidate 5 Candidate 6
7 PE01      2           1           5
8 PE02      4           2           3
At which point my group-split-pivot-combine approach doesn’t work.
I don’t need the data to be split by contest per se, just by where the candidate names are, so I can pivot correctly. So long as I have each candidate identified with the right contest in the final output, all is good. I ended up using group_by(contest) because I needed the contest as a variable regardless and it was a straightforward approach.
I’m struggling to figure out a different way to identify where to split the data. My thought is to use the same str_detect of a space to identify where to split, but my searches online have not led me to figure out how exactly to do that. Many questions on this site that seem to ask how to split by value get answers that aren't applicable here (often because they didn't need to split, or the answer uses group_split). I also found some great answers for pandas DataFrames in Python which, similarly, do not apply here.
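Roughly, the detection step I have in mind on the toy data above (t) would be something like this, flagging the header rows with str_detect() and numbering the blocks with a cumulative sum; what I haven't figured out is how to split and pivot from there:
library(dplyr)
library(stringr)
# flag rows whose first column contains a space (the contest header rows)
# and turn that flag into a running block id
t %>%
  mutate(block_id = cumsum(str_detect(t1, " ")))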
A dput() of a sample of the actual data (the decimals are there because the election uses cumulative voting):
structure(list(V1 = c("FOR CITY COUNCIL AT LARGE", "Precinct",
"PE01", "PE02", "PE03", "PE04", "PE05", "PE06", "PE07", "PE08",
"PE09", "PE10", "PE11", "PE12", "PE13", "PE14", "PE15", "PE16",
"PE17", "PE18", "PE19", "PE20", "PE21", "PE22", "PE23", "PE24",
"PE25", "PE26", "PE27", "PE28", "PE29", "PE30", "PE31", "PE32",
"PE33", "PE34", "PE35", "PE36", "PE37", "PE38", "PE39", "PE40",
"PE41", "PE42", "PE43", "PE44", "PE45", "PE46", "PE47", "PE48",
"PE49", "PE50", "FOR CITY COUNCIL AT LARGE", "Precinct", "PE01",
"PE02", "PE03", "PE04", "PE05", "PE06", "PE07", "PE08", "PE09",
"PE10", "PE11", "PE12", "PE13", "PE14", "PE15", "PE16", "PE17",
"PE18", "PE19", "PE20", "PE21", "PE22", "PE23", "PE24", "PE25",
"PE26", "PE27", "PE28", "PE29", "PE30", "PE31", "PE32", "PE33",
"PE34", "PE35", "PE36", "PE37", "PE38", "PE39", "PE40", "PE41",
"PE42", "PE43", "PE44", "PE45", "PE46", "PE47", "PE48", "PE49",
"PE50"), V8 = c(NA, "Beth Jensen", "4.5000", "2.0000", "4.0000",
"18.2500", "1.0000", "21.1667", "30.2500", "10.5833", "8.7500",
"11.8333", "10.5000", "48.6667", "14.2500", "18.1667", "81.8333",
"35.3333", "52.0833", "143.1667", "155.1667", "136.7500", "17.5000",
"14.2500", "12.1667", "4.9167", "24.6667", "34.8333", "108.6667",
"44.5000", "80.5000", "64.6667", "18.7500", "4.0000", "36.4167",
"44.5000", "12.6667", "36.5000", "19.9167", "74.4167", "19.8333",
"47.0000", "25.4167", "26.2500", "58.0000", "49.0833", "79.6667",
"47.2500", "36.3333", "129.2500", "61.4167", "65.9167", NA, "John L. Kelly",
"6.2500", "0.0000", "2.0000", "2.5000", "1.0000", "6.0000", "36.9167",
"3.3333", "0.0000", "5.0000", "0.0000", "18.6667", "7.6667",
"2.0000", "21.2500", "4.1667", "3.0000", "8.1667", "26.0000",
"26.5000", "0.0000", "7.5000", "16.5833", "2.6667", "3.6667",
"22.3333", "64.0000", "10.0000", "29.8333", "48.8333", "6.5000",
"14.6667", "5.5000", "6.7500", "0.0000", "20.7500", "1.0000",
"85.5833", "8.0000", "34.7500", "10.1667", "7.3333", "31.2500",
"29.2500", "57.0833", "60.3333", "30.5000", "66.5000", "31.9167",
"39.2500"), V9 = c(NA, "Zachary M. Oyler", "1.6667", "1.0000",
"7.1667", "10.5000", "7.0000", "1.0000", "13.5833", "5.8333",
"7.0000", "4.6667", "25.0000", "39.0833", "4.7500", "11.6667",
"35.2500", "9.6667", "8.1667", "23.0833", "20.0000", "32.8333",
"3.9167", "42.4167", "33.5000", "3.6667", "40.1667", "50.8333",
"79.1667", "58.6667", "123.0833", "137.2500", "24.3333", "5.5000",
"13.3333", "6.6667", "6.0000", "6.4167", "5.9167", "48.5000",
"19.5000", "15.5000", "24.6667", "26.3333", "60.1667", "34.5833",
"46.5000", "27.3333", "57.5000", "104.0000", "40.5000", "95.3333",
NA, "Beth Akeson", "10.0000", "1.0000", "1.0000", "5.5000", "1.0000",
"2.2500", "9.0000", "3.7500", "1.0000", "1.0000", "1.6667", "16.3333",
"18.0000", "4.5000", "25.9167", "6.0000", "14.0000", "16.2500",
"19.6667", "23.0833", "2.0000", "2.0000", "3.6667", "3.6667",
"12.6667", "11.0000", "72.0833", "10.8333", "46.2500", "12.5000",
"6.9167", "3.5000", "8.8333", "15.6667", "3.9167", "5.0000",
"5.7500", "82.0000", "10.0000", "16.7500", "17.4167", "19.6667",
"33.7500", "33.5833", "41.2500", "18.1667", "29.3333", "63.2500",
"30.3333", "20.6667"), V10 = c(NA, "Rita Ali", "79.9167", "100.5000",
"65.7500", "183.5000", "63.5000", "80.0833", "166.5000", "128.2500",
"253.5833", "222.7500", "292.7500", "125.5833", "68.6667", "68.0000",
"93.9167", "98.4167", "71.0000", "41.2500", "24.8333", "91.7500",
"92.9167", "141.1667", "118.4167", "125.5000", "108.3333", "100.6667",
"93.2500", "102.5833", "88.6667", "96.8333", "94.6667", "63.3333",
"102.9167", "86.0833", "80.7500", "87.0833", "39.3333", "73.1667",
"29.3333", "117.5833", "108.5833", "89.6667", "175.9167", "90.8333",
"81.1667", "100.0000", "58.7500", "96.7500", "88.5000", "105.5000",
NA, "Aaron T. Chess Jr.", "4.5000", "11.0000", "10.6667", "8.5000",
"35.0000", "19.3333", "28.0000", "6.4167", "20.8333", "12.2500",
"21.4167", "5.3333", "11.1667", "2.0000", "1.0000", "8.6667",
"0.0000", "2.5000", "2.0000", "2.2500", "11.6667", "3.5000",
"9.2500", "10.3333", "17.5000", "11.1667", "5.5000", "3.9167",
"1.0000", "2.2500", "5.0000", "5.1667", "9.3333", "1.0000", "12.0000",
"12.6667", "7.6667", "0.0000", "3.0000", "11.0000", "11.1667",
"16.5000", "1.0000", "8.9167", "7.3333", "1.0000", "2.0000",
"5.0000", "5.7500", "7.9167"), V11 = c(NA, "Sid Ruckriegel",
"1.6667", "3.0000", "3.0000", "7.0000", "6.0000", "2.6667", "40.2500",
"7.6667", "7.0000", "3.6667", "10.0000", "46.7500", "7.6667",
"25.6667", "30.2500", "34.5833", "20.0000", "79.0000", "42.0000",
"62.0833", "6.2500", "7.4167", "28.9167", "4.3333", "27.0000",
"21.0000", "81.1667", "26.0000", "53.8333", "56.5000", "12.3333",
"6.5000", "31.8333", "20.6667", "3.5000", "11.0833", "18.2500",
"98.8333", "19.1667", "34.7500", "20.6667", "20.3333", "41.5000",
"43.9167", "64.8333", "57.5833", "45.8333", "120.1667", "35.0000",
"44.9167", NA, "Kevin R. Sibley", "5.0000", "0.0000", "0.0000",
"0.0000", "2.0000", "2.0000", "1.0000", "0.0000", "1.0000", "2.0000",
"1.0000", "2.0000", "2.0000", "7.0000", "6.0000", "5.0000", "1.0000",
"0.0000", "0.0000", "20.0000", "0.0000", "10.0000", "0.0000",
"6.2500", "7.0000", "3.0000", "6.4167", "6.6667", "3.5000", "16.0000",
"0.0000", "10.0000", "6.0000", "4.5000", "1.0000", "3.2500",
"2.6667", "21.0000", "3.6667", "27.2500", "0.0000", "10.0000",
"2.6667", "2.0000", "2.0000", "4.7500", "12.6667", "7.0000",
"59.7500", "7.1667"), V12 = c(NA, "Peter Kobak", "4.7500", "1.0000",
"3.9167", "3.5000", "1.0000", "10.5833", "40.7500", "2.6667",
"3.7500", "11.9167", "27.0000", "31.8333", "5.9167", "23.5000",
"76.4167", "76.0000", "74.1667", "51.0833", "104.6667", "163.7500",
"17.8333", "0.0000", "21.5833", "16.1667", "20.7500", "44.6667",
"64.9167", "54.2500", "52.4167", "32.5833", "23.9167", "14.5000",
"42.4167", "19.5833", "6.2500", "20.9167", "12.0000", "50.2500",
"26.3333", "60.1667", "9.5833", "20.8333", "44.1667", "36.4167",
"26.6667", "12.7500", "32.4167", "61.3333", "50.5000", "55.8333",
NA, "James B. Golden", "2.5000", "0.0000", "3.5000", "0.0000",
"5.0000", "1.0000", "10.0000", "1.0000", "2.0000", "1.0000",
"0.0000", "5.3333", "0.0000", "0.0000", "4.1667", "1.6667", "7.1667",
"0.0000", "0.0000", "6.6667", "0.0000", "6.0000", "0.0000", "0.0000",
"1.2500", "2.0000", "6.1667", "2.0000", "1.0000", "3.0000", "1.0000",
"1.0000", "5.5000", "2.0000", "2.0000", "2.0000", "0.0000", "2.0000",
"1.0000", "2.0000", "0.0000", "1.0000", "1.0000", "0.0000", "7.2500",
"9.0000", "3.2500", "0.0000", "3.0000", "6.6667"), V13 = c(NA,
"Branden Martin", "10.0000", "0.0000", "1.0000", "9.5000", "0.0000",
"7.0000", "4.9167", "1.6667", "0.0000", "1.0000", "3.0000", "34.8333",
"6.7500", "6.0000", "15.9167", "16.5000", "7.8333", "7.2500",
"7.0000", "5.0000", "0.0000", "1.0000", "4.7500", "0.0000", "3.6667",
"9.1667", "35.9167", "14.3333", "32.8333", "27.7500", "10.3333",
"5.5000", "5.0000", "9.0000", "1.0000", "11.7500", "5.3333",
"18.8333", "9.6667", "18.1667", "27.8333", "20.2500", "39.5833",
"15.3333", "48.1667", "26.5000", "37.2500", "41.0000", "50.1667",
"10.9167", NA, "Mary Nell Hayes", "1.2500", "0.0000", "6.0000",
"2.2500", "2.0000", "2.2500", "13.6667", "0.0000", "8.0000",
"6.0000", "1.6667", "5.1667", "2.5000", "0.0000", "5.0000", "2.0000",
"2.6667", "5.2500", "1.0000", "0.0000", "0.0000", "0.0000", "2.0000",
"0.0000", "2.0000", "0.0000", "11.6667", "1.0000", "1.0000",
"5.1667", "2.0000", "2.0000", "5.6667", "8.6667", "1.0000", "8.8333",
"1.0000", "1.0000", "2.0000", "5.6667", "1.2500", "0.0000", "1.0000",
"1.0000", "4.5000", "5.1667", "2.0000", "2.9167", "2.0000", "6.0000"
), V14 = c(NA, "Janice Louise Lindsay", "1.0000", "7.0000", "16.4167",
"13.7500", "2.0000", "16.4167", "5.0000", "5.8333", "3.0000",
"13.1667", "11.4167", "9.0000", "13.6667", "7.0000", "9.5000",
"5.2500", "4.2500", "0.0000", "0.0000", "1.0000", "5.5000", "6.6667",
"3.2500", "4.3333", "12.1667", "2.0000", "4.2500", "8.6667",
"2.0000", "9.7500", "16.5833", "1.0000", "7.1667", "7.6667",
"0.0000", "3.6667", "4.5000", "2.2500", "2.0000", "3.0000", "6.6667",
"2.0000", "8.0000", "2.5000", "1.2500", "9.0000", "4.5000", "5.9167",
"4.5000", "9.3333", NA, "Andre W. Allen", "17.0000", "18.5000",
"27.5833", "8.0000", "8.5000", "17.2500", "43.1667", "17.0000",
"14.0833", "27.7500", "38.5833", "20.4167", "31.0000", "8.5000",
"26.5833", "14.7500", "16.1667", "7.0000", "16.6667", "38.5833",
"26.4167", "28.0833", "39.9167", "52.1667", "52.5000", "51.3333",
"28.6667", "29.5833", "52.4167", "48.9167", "12.6667", "10.3333",
"28.0833", "35.2500", "36.4167", "25.0833", "5.0000", "33.0000",
"21.5000", "33.4167", "61.5833", "23.8333", "70.7500", "17.5833",
"23.3333", "33.1667", "20.1667", "35.2500", "36.1667", "42.0833"
)), row.names = c(NA, -104L), class = c("tbl_df", "tbl", "data.frame"
))
CodePudding user response:
You can use is.na() on the second column to detect where the new blocks start. Then cumsum(is.na()) will give you an id for each block. Group on this, not the contest name.
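As a quick illustration of the cumsum(is.na()) trick on a shortened version of your second column:
v8 <- c(NA, "Beth Jensen", "4.5000", "2.0000", NA, "John L. Kelly", "6.2500", "0.0000")
cumsum(is.na(v8))
#> [1] 1 1 1 1 2 2 2 2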
e_pivot <- function(x) {
x %>%
select(-block_id) %>%
slice(-1) %>%
row_to_names(row_number = 1) %>%
pivot_longer(cols = 3:last_col(), names_to = "candidate", values_to = "votes") %>%
rename(contest = 1, precinct = Precinct) %>%
mutate(votes = as.numeric(votes))
}
t %>%
mutate(contest = if_else(str_detect(V1, " "), V1, NA_character_),
block_id = cumsum(is.na(V8)),
.before = V1) %>%
fill(contest, .direction = "down") %>%
group_by(block_id) %>%
group_split() %>%
map_dfr(e_pivot)
CodePudding user response:
This approach counts the occurrences of "FOR CITY COUNCIL AT LARGE" and should work regardless of the number of candidates you have:
library(tidyverse)
# create a list to store the results
list_of_dfs <- list()
# For however many times "FOR CITY COUNCIL AT LARGE" occurs,
# increment a 'counter' and use it to subset the original dataframe
for (i in seq_along(grep("FOR CITY COUNCIL AT LARGE", df$V1))) {
list_of_dfs[[i]] <- df[cumsum(grepl(pattern = "FOR CITY COUNCIL AT LARGE", df$V1)) == i,]
}
# Then bind the subsets together by column
t <- bind_cols(list_of_dfs, .name_repair = "minimal")
# Remove the duplicated precinct ("V1") column
t2 <- t[!duplicated(as.list(t))]
# And rename the cols
names(t2) <- paste0("t", 1:ncol(t2))
# Then, format/pivot the data as 'normal'
library(janitor)
library(purrr)
t_lst <- t2 %>%
mutate(contest = if_else(str_detect(t1, " "), t1, NA_character_), .before = t1) %>%
fill(contest, .direction = "down") %>%
group_by(contest) %>%
group_split()
e_pivot <- function(x) {
x %>%
slice(-1) %>%
row_to_names(row_number = 1) %>%
pivot_longer(cols = 3:last_col(), names_to = "candidate", values_to = "votes") %>%
rename(contest = 1, precinct = Precinct) %>%
mutate(votes = as.numeric(votes))
}
map_dfr(t_lst, e_pivot)
#> # A tibble: 700 × 4
#> contest precinct candidate votes
#> <chr> <chr> <chr> <dbl>
#> 1 FOR CITY COUNCIL AT LARGE PE01 Beth Jensen 4.5
#> 2 FOR CITY COUNCIL AT LARGE PE01 Zachary M. Oyler 1.67
#> 3 FOR CITY COUNCIL AT LARGE PE01 Rita Ali 79.9
#> 4 FOR CITY COUNCIL AT LARGE PE01 Sid Ruckriegel 1.67
#> 5 FOR CITY COUNCIL AT LARGE PE01 Peter Kobak 4.75
#> 6 FOR CITY COUNCIL AT LARGE PE01 Branden Martin 10
#> 7 FOR CITY COUNCIL AT LARGE PE01 Janice Louise Lindsay 1
#> 8 FOR CITY COUNCIL AT LARGE PE01 John L. Kelly 6.25
#> 9 FOR CITY COUNCIL AT LARGE PE01 Beth Akeson 10
#> 10 FOR CITY COUNCIL AT LARGE PE01 Aaron T. Chess Jr. 4.5
#> # … with 690 more rows
#> # ℹ Use `print(n = ...)` to see more rows
# This approach should work no matter how many candidates there are,
# so wrap everything in a function to make it 'easier' to use:
format_results <- function(df) {
# create a list to store the results
list_of_dfs <- list()
# For however many times "FOR CITY COUNCIL AT LARGE" occurs,
# increment a 'counter' and use it to subset the original dataframe
for (i in seq_along(grep("FOR CITY COUNCIL AT LARGE", df$V1))) {
list_of_dfs[[i]] <- df[cumsum(grepl(pattern = "FOR CITY COUNCIL AT LARGE", df$V1)) == i,]
}
# Then bind the subsets together by column
t <- bind_cols(list_of_dfs, .name_repair = "minimal")
# Remove the duplicated precinct ("V1") column
t2 <- t[!duplicated(as.list(t))]
# And rename the cols
names(t2) <- paste0("t", 1:ncol(t2))
# Then, format/pivot the data as 'normal'
library(janitor)
library(purrr)
t_lst <- t2 %>%
mutate(contest = if_else(str_detect(t1, " "), t1, NA_character_), .before = t1) %>%
fill(contest, .direction = "down") %>%
group_by(contest) %>%
group_split()
e_pivot <- function(x) {
x %>%
slice(-1) %>%
row_to_names(row_number = 1) %>%
pivot_longer(cols = 3:last_col(), names_to = "candidate", values_to = "votes") %>%
rename(contest = 1, precinct = Precinct) %>%
mutate(votes = as.numeric(votes))
}
return(map_dfr(t_lst, e_pivot))
}
format_results(df)
#> # A tibble: 700 × 4
#> contest precinct candidate votes
#> <chr> <chr> <chr> <dbl>
#> 1 FOR CITY COUNCIL AT LARGE PE01 Beth Jensen 4.5
#> 2 FOR CITY COUNCIL AT LARGE PE01 Zachary M. Oyler 1.67
#> 3 FOR CITY COUNCIL AT LARGE PE01 Rita Ali 79.9
#> 4 FOR CITY COUNCIL AT LARGE PE01 Sid Ruckriegel 1.67
#> 5 FOR CITY COUNCIL AT LARGE PE01 Peter Kobak 4.75
#> 6 FOR CITY COUNCIL AT LARGE PE01 Branden Martin 10
#> 7 FOR CITY COUNCIL AT LARGE PE01 Janice Louise Lindsay 1
#> 8 FOR CITY COUNCIL AT LARGE PE01 John L. Kelly 6.25
#> 9 FOR CITY COUNCIL AT LARGE PE01 Beth Akeson 10
#> 10 FOR CITY COUNCIL AT LARGE PE01 Aaron T. Chess Jr. 4.5
#> # … with 690 more rows
#> # ℹ Use `print(n = ...)` to see more rows
Created on 2022-08-29 by the reprex package (v2.0.1)
CodePudding user response:
library(tidyverse)
df %>%
  # standardise the column names to t1, t2, ...
  rename_with(~str_c('t', seq_along(.))) %>%
  # rows where every column except the first is NA are the contest headers
  mutate(CONTEST = ifelse(if_all(-1, is.na), t1, NA),
         Precinct = ifelse(is.na(CONTEST), t1, NA),
         t1 = NULL, .before = 1) %>%
  fill(CONTEST) %>%
  # drop the contest-header rows now that CONTEST has been filled down
  filter(!if_all(-1, is.na)) %>%
  pivot_longer(-(1:2)) %>%
  # values starting with a non-digit are candidate names, not vote counts
  mutate(grouper = str_detect(value, "^\\D")) %>%
  # start a new group each time a run of candidate-name rows begins
  group_by(grp = cumsum(c(1, diff(grouper)) == 1)) %>%
  # within each group, look up the candidate name for each original column name
  mutate(name = setNames(value[grouper], name[grouper])[name]) %>%
  filter(!grouper) %>%
  mutate(value = as.numeric(value), grouper = NULL)
# A tibble: 12 x 5
# Groups: grp [2]
   CONTEST   Precinct name        value   grp
   <chr>     <chr>    <chr>       <dbl> <int>
 1 CONTEST 1 PE01     Candidate 1     1     1
 2 CONTEST 1 PE01     Candidate 2     4     1
 3 CONTEST 1 PE01     Candidate 3     2     1
 4 CONTEST 1 PE02     Candidate 1     5     1
 5 CONTEST 1 PE02     Candidate 2     3     1
 6 CONTEST 1 PE02     Candidate 3     2     1
 7 CONTEST 1 PE01     Candidate 4     2     2
 8 CONTEST 1 PE01     Candidate 5     1     2
 9 CONTEST 1 PE01     Candidate 6     5     2
10 CONTEST 1 PE02     Candidate 4     4     2
11 CONTEST 1 PE02     Candidate 5     2     2
12 CONTEST 1 PE02     Candidate 6     3     2
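A short note on the setNames(value[grouper], name[grouper])[name] step above, since it does the heavy lifting: within each group it builds a named vector mapping the original column names (t2, t3, ...) to the candidate names taken from the header rows, then uses that vector as a lookup to relabel every row. A minimal illustration of the lookup pattern, with toy values standing in for one block:
library(stringr)  # attached by library(tidyverse) above
name    <- c("t2", "t3", "t2", "t3")                  # column names after pivot_longer
value   <- c("Candidate 1", "Candidate 2", "1", "4")  # candidate names first, then vote counts
grouper <- str_detect(value, "^\\D")                  # TRUE on the candidate-name rows
lookup <- setNames(value[grouper], name[grouper])     # c(t2 = "Candidate 1", t3 = "Candidate 2")
unname(lookup[name])
#> [1] "Candidate 1" "Candidate 2" "Candidate 1" "Candidate 2"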