Home > database >  How to fill empty rows using first text value in each row in R?
How to fill empty rows using first text value in each row in R?

Time:10-23

Data

Here is the dput for a subset of the data I'm working with:

crime_state <- structure(list(State = c("ALABAMA", "", "ARIZONA", "", "", "", 
"", "", "", "", "", "", "ARKANSAS", "CALIFORNIA", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
""), City = c("HUNTSVILLE4", "TUSCALOOSA", "CHANDLER", "GILBERT", 
"GLENDALE", "MESA", "PEORIA", "PHOENIX", "SCOTTSDALE", "SURPRISE", 
"TEMPE", "TUCSON", "LITTLE ROCK", "ANAHEIM", "ANTIOCH", "BAKERSFIELD", 
"BERKELEY", "BURBANK", "CARLSBAD", "CHULA VISTA", "CLOVIS", "CONCORD", 
"CORONA", "COSTA MESA", "DALY CITY", "DOWNEY", "EL CAJON", "EL MONTE", 
"ELK GROVE", "ESCONDIDO", "FAIRFIELD", "FONTANA", "FREMONT", 
"FRESNO", "FULLERTON", "GARDEN GROVE", "GLENDALE", "HAYWARD", 
"HUNTINGTON BEACH", "INGLEWOOD", "IRVINE", "JURUPA VALLEY", "LANCASTER", 
"LONG BEACH", "LOS ANGELES", "MODESTO", "MORENO VALLEY", "MURRIETA", 
"NORWALK", "OAKLAND"), X = c(2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L
), Population1 = c("196,620", "101,764", "255,986", "247,463", 
"249,799", "504,873", "170,177", "1,653,080", "254,961", "136,611", 
"188,543", "537,392", "199,288", "354,743", "112,956", "385,609", 
"123,735", "105,041", "116,739", "274,370", "111,759", "130,855", 
"170,041", "114,358", "107,928", "113,277", "104,497", "116,464", 
"174,651", "153,073", "117,883", "213,964", "238,024", "531,818", 
"141,132", "174,661", "204,724", "162,881", "203,428", "110,726", 
"288,052", "107,605", "160,818", "470,445", "4,029,741", "215,822", 
"209,145", "114,706", "106,158", "430,230"), Violent..crime = c("", 
"253", "287", "132", "594", "967", "201", "6,118", "211", "67", 
"448", "2,027", "1,336", "578", "283", "911", "270", "129", "115", 
"406", "106", "246", "127", "170", "113", "174", "246", "181", 
"198", "237", "335", "331", "274", "1,489", "174", "266", "83", 
"330", "218", "349", "82", "134", "586", "1,702", "14,709", "963", 
"396", "49", "219", "2,675"), Murder = c(NA, 3L, 1L, 1L, 7L, 
10L, 2L, 67L, 7L, 3L, 3L, 26L, 19L, 2L, 1L, 18L, 1L, 0L, 1L, 
4L, 1L, 1L, 4L, 3L, 2L, 3L, 2L, 2L, 0L, 1L, 2L, 5L, 1L, 18L, 
0L, 2L, 0L, 0L, 0L, 7L, 0L, 5L, 4L, 11L, 132L, 8L, 7L, 2L, 5L, 
32L), Rape2 = c("", "20", "74", "49", "69", "143", "33", "566", 
"53", "10", "75", "255", "121", "70", "24", "60", "31", "7", 
"19", "43", "23", "27", "27", "35", "18", "7", "20", "12", "16", 
"28", "53", "27", "32", "93", "27", "28", "16", "55", "41", "27", 
"22", "8", "49", "107", "1,324", "47", "17", "5", "4", "214"), 
    Robbery = c("", "71", "62", "15", "171", "192", "29", "1,438", 
    "52", "20", "97", "670", "150", "195", "102", "402", "167", 
    "48", "23", "114", "15", "103", "49", "59", "45", "83", "86", 
    "78", "40", "71", "89", "94", "115", "473", "66", "90", "24", 
    "170", "59", "156", "22", "45", "142", "527", "5,139", "204", 
    "141", "18", "61", "1,220"), Aggravated..assault = c("", 
    "159", "150", "67", "347", "622", "137", "4,047", "99", "34", 
    "273", "1,076", "1,046", "311", "156", "431", "71", "74", 
    "72", "245", "67", "115", "47", "73", "48", "81", "138", 
    "89", "142", "137", "191", "205", "126", "905", "81", "146", 
    "43", "105", "118", "159", "38", "76", "391", "1,057", "8,114", 
    "704", "231", "24", "149", "1,209"), Property..crime = c("", 
    "2,092", "2,771", "1,705", "4,830", "5,113", "1,596", "28,854", 
    "2,853", "1,088", "3,567", "13,925", "6,236", "4,399", "1,479", 
    "8,330", "2,581", "1,362", "1,081", "1,905", "1,271", "2,151", 
    "1,535", "1,911", "719", "1,353", "1,164", "1,216", "1,128", 
    "1,370", "1,639", "1,683", "2,387", "8,628", "1,878", "2,096", 
    "1,491", "2,434", "1,871", "1,382", "1,670", "1,389", "1,855", 
    "5,936", "50,093", "3,965", "2,933", "661", "1,057", "10,677"
    ), Burglary = c("", "414", "369", "214", "756", "751", "281", 
    "5,465", "350", "138", "456", "1,690", "1,068", "733", "278", 
    "1,997", "403", "141", "169", "307", "160", "261", "191", 
    "265", "119", "243", "166", "257", "147", "187", "211", "289", 
    "404", "1,411", "176", "329", "188", "262", "227", "221", 
    "276", "201", "512", "1,077", "7,869", "564", "591", "128", 
    "222", "1,160"), Larceny..theft = c("", "1,569", "2,241", 
    "1,409", "3,581", "3,901", "1,224", "19,461", "2,362", "860", 
    "2,841", "10,959", "4,643", "2,969", "910", "4,875", "1,915", 
    "1,115", "830", "1,244", "1,024", "1,611", "1,082", "1,486", 
    "517", "785", "827", "677", "892", "933", "1,121", "928", 
    "1,620", "6,126", "1,460", "1,438", "1,156", "1,468", "1,485", 
    "822", "1,307", "791", "981", "3,688", "33,364", "2,819", 
    "1,828", "435", "604", "7,021"), Motor..vehicle..theft = c("", 
    "109", "161", "82", "493", "461", "91", "3,928", "141", "90", 
    "270", "1,276", "525", "697", "291", "1,458", "263", "106", 
    "82", "354", "87", "279", "262", "160", "83", "325", "171", 
    "282", "89", "250", "307", "466", "363", "1,091", "242", 
    "329", "147", "704", "159", "339", "87", "397", "362", "1,171", 
    "8,860", "582", "514", "98", "231", "2,496"), Arson3 = c(NA, 
    NA, 8L, 5L, 34L, 7L, 4L, 140L, 7L, 3L, 3L, 92L, 24L, 13L, 
    28L, 116L, 21L, 7L, 5L, 9L, 2L, 12L, 10L, 12L, 3L, 8L, 9L, 
    10L, 6L, 5L, 11L, 5L, 6L, 129L, 8L, 6L, 5L, 9L, 9L, 7L, 2L, 
    0L, 14L, 57L, 897L, 26L, 5L, 2L, 4L, 106L)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -50L))

Problem

I have data that looks like this, where each state has multiple counties, and thus the state rows have several empty values:

# A tibble: 276 × 14
   State     City         X Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³
   <chr>     <chr>    <int> <chr>   <chr>    <int> <chr> <chr>   <chr>  
 1 "ALABAMA" HUNTSVI…  2018 196,620 ""          NA ""    ""      ""     
 2 ""        TUSCALO…  2018 101,764 "253"        3 "20"  "71"    "159"  
 3 "ARIZONA" CHANDLER  2018 255,986 "287"        1 "74"  "62"    "150"  
 4 ""        GILBERT   2018 247,463 "132"        1 "49"  "15"    "67"   
 5 ""        GLENDALE  2018 249,799 "594"        7 "69"  "171"   "347"  
 6 ""        MESA      2018 504,873 "967"       10 "143" "192"   "622"  
 7 ""        PEORIA    2018 170,177 "201"        2 "33"  "29"    "137"  
 8 ""        PHOENIX   2018 1,653,… "6,118"     67 "566" "1,438" "4,047"
 9 ""        SCOTTSD…  2018 254,961 "211"        7 "53"  "52"    "99"   
10 ""        SURPRISE  2018 136,611 "67"         3 "10"  "20"    "34"

For this part of the data, Alabama should fill the first two rows, and Arizona should be filling the last 8 rows that are empty. I tried using the methods in this post 6 years ago but using this code (supplying as.character since that is what most of the data is here):

library(tidyverse)

crime_state %>%
  mutate_all(as.character) %>%
  fill(names(.), 
       .direction = "up")

I have no change in the rows I need fixed. Is there a better alternative for my data? This would be the ideal tibble:

# A tibble: 276 × 14
   State     City     X     Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³
   <chr>     <chr>    <chr> <chr>   <chr>   <chr>  <chr> <chr>   <chr>  
 1 "ALABAMA" HUNTSVI… 2018  196,620 ""      3      ""    ""      ""     
 2 "ALABAMA" TUSCALO… 2018  101,764 "253"   3      "20"  "71"    "159"  
 3 "ARIZONA" CHANDLER 2018  255,986 "287"   1      "74"  "62"    "150"  
 4 "ARIZONA" GILBERT  2018  247,463 "132"   1      "49"  "15"    "67"   
 5 "ARIZONA" GLENDALE 2018  249,799 "594"   7      "69"  "171"   "347"  
 6 "ARIZONA" MESA     2018  504,873 "967"   10     "143" "192"   "622"  
 7 "ARIZONA" PEORIA   2018  170,177 "201"   2      "33"  "29"    "137"  
 8 "ARIZONA" PHOENIX  2018  1,653,… "6,118" 67     "566" "1,438" "4,047"
 9 "ARIZONA" SCOTTSD… 2018  254,961 "211"   7      "53"  "52"    "99"   
10 "ARIZONA" SURPRISE 2018  136,611 "67"    3      "10"  "20"    "34" 

CodePudding user response:

What about:

[...]
crime_state |>
  dplyr::mutate(State = ifelse(State == "", NA, State)) |>
  tidyr::fill(State)
#> # A tibble: 50 × 14
#>    State   City           X Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³ Prope…⁴
#>    <chr>   <chr>      <int> <chr>   <chr>    <int> <chr> <chr>   <chr>   <chr>  
#>  1 ALABAMA HUNTSVILL…  2018 196,620 ""          NA ""    ""      ""      ""     
#>  2 ALABAMA TUSCALOOSA  2018 101,764 "253"        3 "20"  "71"    "159"   "2,092"
#>  3 ARIZONA CHANDLER    2018 255,986 "287"        1 "74"  "62"    "150"   "2,771"
#>  4 ARIZONA GILBERT     2018 247,463 "132"        1 "49"  "15"    "67"    "1,705"
#>  5 ARIZONA GLENDALE    2018 249,799 "594"        7 "69"  "171"   "347"   "4,830"
#>  6 ARIZONA MESA        2018 504,873 "967"       10 "143" "192"   "622"   "5,113"
#>  7 ARIZONA PEORIA      2018 170,177 "201"        2 "33"  "29"    "137"   "1,596"
#>  8 ARIZONA PHOENIX     2018 1,653,… "6,118"     67 "566" "1,438" "4,047" "28,85…
#>  9 ARIZONA SCOTTSDALE  2018 254,961 "211"        7 "53"  "52"    "99"    "2,853"
#> 10 ARIZONA SURPRISE    2018 136,611 "67"         3 "10"  "20"    "34"    "1,088"
#> # … with 40 more rows, 4 more variables: Burglary <chr>, Larceny..theft <chr>,
#> #   Motor..vehicle..theft <chr>, Arson3 <int>, and abbreviated variable names
#> #   ¹​Population1, ²​Violent..crime, ³​Aggravated..assault, ⁴​Property..crime

Created on 2022-10-22 with reprex v2.0.2

CodePudding user response:

I suggest a simplfication from @Grzegorz great approach. This is the case for dplyr::na_if:

library(dplyr)
library(tidyr)

library(tidyverse)
crime_state %>%
    mutate(State = na_if(State, '')) %>%
    fill(State)

 A tibble: 50 × 14
   State   City            X Population1 Violent..crime Murder Rape2 Robbery Aggravated..assault Property..crime Burglary Larceny..theft Motor..vehicle..t…¹ Arson3
   <chr>   <chr>       <int> <chr>       <chr>           <int> <chr> <chr>   <chr>               <chr>           <chr>    <chr>          <chr>                <int>
 1 ALABAMA HUNTSVILLE4  2018 196,620     ""                 NA ""    ""      ""                  ""              ""       ""             ""                      NA
 2 ALABAMA TUSCALOOSA   2018 101,764     "253"               3 "20"  "71"    "159"               "2,092"         "414"    "1,569"        "109"                   NA
 3 ARIZONA CHANDLER     2018 255,986     "287"               1 "74"  "62"    "150"               "2,771"         "369"    "2,241"        "161"                    8
 4 ARIZONA GILBERT      2018 247,463     "132"               1 "49"  "15"    "67"                "1,705"         "214"    "1,409"        "82"                     5
 5 ARIZONA GLENDALE     2018 249,799     "594"               7 "69"  "171"   "347"               "4,830"         "756"    "3,581"        "493"                   34
 6 ARIZONA MESA         2018 504,873     "967"              10 "143" "192"   "622"               "5,113"         "751"    "3,901"        "461"                    7
 7 ARIZONA PEORIA       2018 170,177     "201"               2 "33"  "29"    "137"               "1,596"         "281"    "1,224"        "91"                     4
 8 ARIZONA PHOENIX      2018 1,653,080   "6,118"            67 "566" "1,438" "4,047"             "28,854"        "5,465"  "19,461"       "3,928"                140
 9 ARIZONA SCOTTSDALE   2018 254,961     "211"               7 "53"  "52"    "99"                "2,853"         "350"    "2,362"        "141"                    7
10 ARIZONA SURPRISE     2018 136,611     "67"                3 "10"  "20"    "34"                "1,088"         "138"    "860"          "90"                     3
# … with 40 more rows, and abbreviated variable name ¹​Motor..vehicle..theft
# ℹ Use `print(n = ...)` to see more rows
  • Related