Data
Here is the dput
for a subset of the data I'm working with:
crime_state <- structure(list(State = c("ALABAMA", "", "ARIZONA", "", "", "",
"", "", "", "", "", "", "ARKANSAS", "CALIFORNIA", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
""), City = c("HUNTSVILLE4", "TUSCALOOSA", "CHANDLER", "GILBERT",
"GLENDALE", "MESA", "PEORIA", "PHOENIX", "SCOTTSDALE", "SURPRISE",
"TEMPE", "TUCSON", "LITTLE ROCK", "ANAHEIM", "ANTIOCH", "BAKERSFIELD",
"BERKELEY", "BURBANK", "CARLSBAD", "CHULA VISTA", "CLOVIS", "CONCORD",
"CORONA", "COSTA MESA", "DALY CITY", "DOWNEY", "EL CAJON", "EL MONTE",
"ELK GROVE", "ESCONDIDO", "FAIRFIELD", "FONTANA", "FREMONT",
"FRESNO", "FULLERTON", "GARDEN GROVE", "GLENDALE", "HAYWARD",
"HUNTINGTON BEACH", "INGLEWOOD", "IRVINE", "JURUPA VALLEY", "LANCASTER",
"LONG BEACH", "LOS ANGELES", "MODESTO", "MORENO VALLEY", "MURRIETA",
"NORWALK", "OAKLAND"), X = c(2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L
), Population1 = c("196,620", "101,764", "255,986", "247,463",
"249,799", "504,873", "170,177", "1,653,080", "254,961", "136,611",
"188,543", "537,392", "199,288", "354,743", "112,956", "385,609",
"123,735", "105,041", "116,739", "274,370", "111,759", "130,855",
"170,041", "114,358", "107,928", "113,277", "104,497", "116,464",
"174,651", "153,073", "117,883", "213,964", "238,024", "531,818",
"141,132", "174,661", "204,724", "162,881", "203,428", "110,726",
"288,052", "107,605", "160,818", "470,445", "4,029,741", "215,822",
"209,145", "114,706", "106,158", "430,230"), Violent..crime = c("",
"253", "287", "132", "594", "967", "201", "6,118", "211", "67",
"448", "2,027", "1,336", "578", "283", "911", "270", "129", "115",
"406", "106", "246", "127", "170", "113", "174", "246", "181",
"198", "237", "335", "331", "274", "1,489", "174", "266", "83",
"330", "218", "349", "82", "134", "586", "1,702", "14,709", "963",
"396", "49", "219", "2,675"), Murder = c(NA, 3L, 1L, 1L, 7L,
10L, 2L, 67L, 7L, 3L, 3L, 26L, 19L, 2L, 1L, 18L, 1L, 0L, 1L,
4L, 1L, 1L, 4L, 3L, 2L, 3L, 2L, 2L, 0L, 1L, 2L, 5L, 1L, 18L,
0L, 2L, 0L, 0L, 0L, 7L, 0L, 5L, 4L, 11L, 132L, 8L, 7L, 2L, 5L,
32L), Rape2 = c("", "20", "74", "49", "69", "143", "33", "566",
"53", "10", "75", "255", "121", "70", "24", "60", "31", "7",
"19", "43", "23", "27", "27", "35", "18", "7", "20", "12", "16",
"28", "53", "27", "32", "93", "27", "28", "16", "55", "41", "27",
"22", "8", "49", "107", "1,324", "47", "17", "5", "4", "214"),
Robbery = c("", "71", "62", "15", "171", "192", "29", "1,438",
"52", "20", "97", "670", "150", "195", "102", "402", "167",
"48", "23", "114", "15", "103", "49", "59", "45", "83", "86",
"78", "40", "71", "89", "94", "115", "473", "66", "90", "24",
"170", "59", "156", "22", "45", "142", "527", "5,139", "204",
"141", "18", "61", "1,220"), Aggravated..assault = c("",
"159", "150", "67", "347", "622", "137", "4,047", "99", "34",
"273", "1,076", "1,046", "311", "156", "431", "71", "74",
"72", "245", "67", "115", "47", "73", "48", "81", "138",
"89", "142", "137", "191", "205", "126", "905", "81", "146",
"43", "105", "118", "159", "38", "76", "391", "1,057", "8,114",
"704", "231", "24", "149", "1,209"), Property..crime = c("",
"2,092", "2,771", "1,705", "4,830", "5,113", "1,596", "28,854",
"2,853", "1,088", "3,567", "13,925", "6,236", "4,399", "1,479",
"8,330", "2,581", "1,362", "1,081", "1,905", "1,271", "2,151",
"1,535", "1,911", "719", "1,353", "1,164", "1,216", "1,128",
"1,370", "1,639", "1,683", "2,387", "8,628", "1,878", "2,096",
"1,491", "2,434", "1,871", "1,382", "1,670", "1,389", "1,855",
"5,936", "50,093", "3,965", "2,933", "661", "1,057", "10,677"
), Burglary = c("", "414", "369", "214", "756", "751", "281",
"5,465", "350", "138", "456", "1,690", "1,068", "733", "278",
"1,997", "403", "141", "169", "307", "160", "261", "191",
"265", "119", "243", "166", "257", "147", "187", "211", "289",
"404", "1,411", "176", "329", "188", "262", "227", "221",
"276", "201", "512", "1,077", "7,869", "564", "591", "128",
"222", "1,160"), Larceny..theft = c("", "1,569", "2,241",
"1,409", "3,581", "3,901", "1,224", "19,461", "2,362", "860",
"2,841", "10,959", "4,643", "2,969", "910", "4,875", "1,915",
"1,115", "830", "1,244", "1,024", "1,611", "1,082", "1,486",
"517", "785", "827", "677", "892", "933", "1,121", "928",
"1,620", "6,126", "1,460", "1,438", "1,156", "1,468", "1,485",
"822", "1,307", "791", "981", "3,688", "33,364", "2,819",
"1,828", "435", "604", "7,021"), Motor..vehicle..theft = c("",
"109", "161", "82", "493", "461", "91", "3,928", "141", "90",
"270", "1,276", "525", "697", "291", "1,458", "263", "106",
"82", "354", "87", "279", "262", "160", "83", "325", "171",
"282", "89", "250", "307", "466", "363", "1,091", "242",
"329", "147", "704", "159", "339", "87", "397", "362", "1,171",
"8,860", "582", "514", "98", "231", "2,496"), Arson3 = c(NA,
NA, 8L, 5L, 34L, 7L, 4L, 140L, 7L, 3L, 3L, 92L, 24L, 13L,
28L, 116L, 21L, 7L, 5L, 9L, 2L, 12L, 10L, 12L, 3L, 8L, 9L,
10L, 6L, 5L, 11L, 5L, 6L, 129L, 8L, 6L, 5L, 9L, 9L, 7L, 2L,
0L, 14L, 57L, 897L, 26L, 5L, 2L, 4L, 106L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -50L))
Problem
I have data that looks like this, where each state has multiple counties, and thus the state rows have several empty values:
# A tibble: 276 × 14
State City X Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³
<chr> <chr> <int> <chr> <chr> <int> <chr> <chr> <chr>
1 "ALABAMA" HUNTSVI… 2018 196,620 "" NA "" "" ""
2 "" TUSCALO… 2018 101,764 "253" 3 "20" "71" "159"
3 "ARIZONA" CHANDLER 2018 255,986 "287" 1 "74" "62" "150"
4 "" GILBERT 2018 247,463 "132" 1 "49" "15" "67"
5 "" GLENDALE 2018 249,799 "594" 7 "69" "171" "347"
6 "" MESA 2018 504,873 "967" 10 "143" "192" "622"
7 "" PEORIA 2018 170,177 "201" 2 "33" "29" "137"
8 "" PHOENIX 2018 1,653,… "6,118" 67 "566" "1,438" "4,047"
9 "" SCOTTSD… 2018 254,961 "211" 7 "53" "52" "99"
10 "" SURPRISE 2018 136,611 "67" 3 "10" "20" "34"
For this part of the data, Alabama should fill the first two rows, and Arizona should be filling the last 8 rows that are empty. I tried using the methods in this post 6 years ago but using this code (supplying as.character
since that is what most of the data is here):
library(tidyverse)
crime_state %>%
mutate_all(as.character) %>%
fill(names(.),
.direction = "up")
I have no change in the rows I need fixed. Is there a better alternative for my data? This would be the ideal tibble:
# A tibble: 276 × 14
State City X Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 "ALABAMA" HUNTSVI… 2018 196,620 "" 3 "" "" ""
2 "ALABAMA" TUSCALO… 2018 101,764 "253" 3 "20" "71" "159"
3 "ARIZONA" CHANDLER 2018 255,986 "287" 1 "74" "62" "150"
4 "ARIZONA" GILBERT 2018 247,463 "132" 1 "49" "15" "67"
5 "ARIZONA" GLENDALE 2018 249,799 "594" 7 "69" "171" "347"
6 "ARIZONA" MESA 2018 504,873 "967" 10 "143" "192" "622"
7 "ARIZONA" PEORIA 2018 170,177 "201" 2 "33" "29" "137"
8 "ARIZONA" PHOENIX 2018 1,653,… "6,118" 67 "566" "1,438" "4,047"
9 "ARIZONA" SCOTTSD… 2018 254,961 "211" 7 "53" "52" "99"
10 "ARIZONA" SURPRISE 2018 136,611 "67" 3 "10" "20" "34"
CodePudding user response:
What about:
[...]
crime_state |>
dplyr::mutate(State = ifelse(State == "", NA, State)) |>
tidyr::fill(State)
#> # A tibble: 50 × 14
#> State City X Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³ Prope…⁴
#> <chr> <chr> <int> <chr> <chr> <int> <chr> <chr> <chr> <chr>
#> 1 ALABAMA HUNTSVILL… 2018 196,620 "" NA "" "" "" ""
#> 2 ALABAMA TUSCALOOSA 2018 101,764 "253" 3 "20" "71" "159" "2,092"
#> 3 ARIZONA CHANDLER 2018 255,986 "287" 1 "74" "62" "150" "2,771"
#> 4 ARIZONA GILBERT 2018 247,463 "132" 1 "49" "15" "67" "1,705"
#> 5 ARIZONA GLENDALE 2018 249,799 "594" 7 "69" "171" "347" "4,830"
#> 6 ARIZONA MESA 2018 504,873 "967" 10 "143" "192" "622" "5,113"
#> 7 ARIZONA PEORIA 2018 170,177 "201" 2 "33" "29" "137" "1,596"
#> 8 ARIZONA PHOENIX 2018 1,653,… "6,118" 67 "566" "1,438" "4,047" "28,85…
#> 9 ARIZONA SCOTTSDALE 2018 254,961 "211" 7 "53" "52" "99" "2,853"
#> 10 ARIZONA SURPRISE 2018 136,611 "67" 3 "10" "20" "34" "1,088"
#> # … with 40 more rows, 4 more variables: Burglary <chr>, Larceny..theft <chr>,
#> # Motor..vehicle..theft <chr>, Arson3 <int>, and abbreviated variable names
#> # ¹Population1, ²Violent..crime, ³Aggravated..assault, ⁴Property..crime
Created on 2022-10-22 with reprex v2.0.2
CodePudding user response:
I suggest a simplfication from @Grzegorz great approach.
This is the case for dplyr::na_if
:
library(dplyr)
library(tidyr)
library(tidyverse)
crime_state %>%
mutate(State = na_if(State, '')) %>%
fill(State)
A tibble: 50 × 14
State City X Population1 Violent..crime Murder Rape2 Robbery Aggravated..assault Property..crime Burglary Larceny..theft Motor..vehicle..t…¹ Arson3
<chr> <chr> <int> <chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int>
1 ALABAMA HUNTSVILLE4 2018 196,620 "" NA "" "" "" "" "" "" "" NA
2 ALABAMA TUSCALOOSA 2018 101,764 "253" 3 "20" "71" "159" "2,092" "414" "1,569" "109" NA
3 ARIZONA CHANDLER 2018 255,986 "287" 1 "74" "62" "150" "2,771" "369" "2,241" "161" 8
4 ARIZONA GILBERT 2018 247,463 "132" 1 "49" "15" "67" "1,705" "214" "1,409" "82" 5
5 ARIZONA GLENDALE 2018 249,799 "594" 7 "69" "171" "347" "4,830" "756" "3,581" "493" 34
6 ARIZONA MESA 2018 504,873 "967" 10 "143" "192" "622" "5,113" "751" "3,901" "461" 7
7 ARIZONA PEORIA 2018 170,177 "201" 2 "33" "29" "137" "1,596" "281" "1,224" "91" 4
8 ARIZONA PHOENIX 2018 1,653,080 "6,118" 67 "566" "1,438" "4,047" "28,854" "5,465" "19,461" "3,928" 140
9 ARIZONA SCOTTSDALE 2018 254,961 "211" 7 "53" "52" "99" "2,853" "350" "2,362" "141" 7
10 ARIZONA SURPRISE 2018 136,611 "67" 3 "10" "20" "34" "1,088" "138" "860" "90" 3
# … with 40 more rows, and abbreviated variable name ¹Motor..vehicle..theft
# ℹ Use `print(n = ...)` to see more rows