Replace value with NA based on whether variable matches an item in a vector

My question is more about how to improve what I suspect is inefficient code.

I have two dataframes: one contains data on county-level disaster information and the other contains data on county-level income per-capita. As a first step, I am interested in identifying for which counties we are missing per-capita income data. Here is what that looks like with sample dataframes:

# Sample disaster data: four Alabama counties, two rows each.
counties <- data.frame(
  polyname = rep(
    c("alabama,autauga", "alabama,baldwin", "alabama,barbour", "alabama,bibb"),
    each = 2
  ),
  indAnyDisaster_frequency = rep(c("1-2", "0", "3-5", "1-2"), each = 2)
)
# Sample per-capita income data for the same counties; the first two counties
# have no income recorded.
counties_persinc_1980 <- data.frame(
  polyname = rep(
    c("alabama,autauga", "alabama,baldwin", "alabama,barbour", "alabama,bibb"),
    each = 2
  ),
  persinc_1980 = c(rep(NA_real_, 4), 25, 30, 32, 28)
)
# Distinct county names that have a missing persinc_1980 value.
no_persinc_1980 <- with(
  counties_persinc_1980,
  unique(polyname[is.na(persinc_1980)])
)

Now, I want to use the vector of missing county names to replace counties$indAnyDisaster_frequency with NA if the counties$polyname of the same index matches an element of the vector. I believe I have been able to achieve this with a for-loop, but I don't believe it is very efficient. However, I have not been able to figure out how to use lapply to achieve the same outcome. I've included both the code for the loop and one of my attempts at using lapply.

# Loop version: set the disaster frequency to NA for every county whose
# income is missing. seq_along() replaces 1:length() so the loop body is
# skipped entirely when no_persinc_1980 is empty (1:0 would iterate over 1, 0).
for (i in seq_along(no_persinc_1980)) {
  counties$indAnyDisaster_frequency[counties$polyname == no_persinc_1980[i]] <- NA
}
# lapply attempt: this does NOT change the global `counties`. The assignment
# inside the anonymous function modifies a copy local to that function call,
# and lapply() only returns a list holding the value of each assignment (NA).
lapply(seq_along(no_persinc_1980), function(x) {
  counties$indAnyDisaster_frequency[counties$polyname == no_persinc_1980[x]] <- NA
})

Any guidance on how to improve this approach would be appreciated.

CodePudding user response：

No loop is necessary. Look into `%in%`, which checks every element of a vector against a set of values in a single vectorized call:

# Vectorised replacement: %in% flags every row whose county appears in
# no_persinc_1980, and replace() sets those entries to NA in one step.
counties$indAnyDisaster_frequency <- replace(
  counties$indAnyDisaster_frequency,
  counties$polyname %in% no_persinc_1980,
  NA
)

counties
#>          polyname indAnyDisaster_frequency
#> 1 alabama,autauga                     <NA>
#> 2 alabama,autauga                     <NA>
#> 3 alabama,baldwin                     <NA>
#> 4 alabama,baldwin                     <NA>
#> 5 alabama,barbour                      3-5
#> 6 alabama,barbour                      3-5
#> 7    alabama,bibb                      1-2
#> 8    alabama,bibb                      1-2

CodePudding user response：

# Row positions of the counties with missing income, then blank those entries.
idx <- which(counties$polyname %in% no_persinc_1980)
counties[["indAnyDisaster_frequency"]][idx] <- NA

CodePudding user response：

library(tidyverse)

Pull the names of counties with NA in column persinc_1980

# Names of counties whose persinc_1980 is missing. De-duplicate AFTER pulling
# polyname: unique() on the whole data frame only drops fully identical rows,
# so a county could be listed twice if its rows differed in another column.
counties_nas <- counties_persinc_1980 %>%
  filter(is.na(persinc_1980)) %>%
  pull(polyname) %>%
  unique()

Change indAnyDisaster_frequency into NA if polyname exists in the vector

# Set the disaster frequency to NA for counties listed in counties_nas and
# leave every other row untouched. The result is printed, not assigned.
counties %>%
  mutate(
    indAnyDisaster_frequency = if_else(
      polyname %in% counties_nas,
      NA_character_,
      indAnyDisaster_frequency
    )
  )

  polyname        indAnyDisaster_frequency
  <chr>           <chr>                   
1 alabama,autauga NA                      
2 alabama,autauga NA                      
3 alabama,baldwin NA                      
4 alabama,baldwin NA                      
5 alabama,barbour 3-5                     
6 alabama,barbour 3-5                     
7 alabama,bibb    1-2                     
8 alabama,bibb    1-2

CodePudding user response：

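I recommend joining the two data frames together: once the disaster and income columns sit side by side, the counties with missing income can be filtered directly. Note that `polyname` is repeated in both sample data frames, so the join below matches every row of a county with every row of the same county in the other table (8 rows become 16).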

library(tidyverse)

# Disaster data: four Alabama counties, two rows per county.
counties <- data.frame(
  polyname = rep(
    c("alabama,autauga", "alabama,baldwin", "alabama,barbour", "alabama,bibb"),
    each = 2
  ),
  indAnyDisaster_frequency = rep(c("1-2", "0", "3-5", "1-2"), each = 2)
)

# Per-capita income data: the first two counties have no value recorded.
counties_persinc_1980 <- data.frame(
  polyname = rep(
    c("alabama,autauga", "alabama,baldwin", "alabama,barbour", "alabama,bibb"),
    each = 2
  ),
  persinc_1980 = c(rep(NA_real_, 4), 25, 30, 32, 28)
)

# Join the income column onto the disaster data by county name.
# NOTE: polyname is repeated in both data frames, so each county's 2 rows
# match 2 income rows (a many-to-many join) and the result has 16 rows, not 8.
disasters <- counties %>%
  left_join(counties_persinc_1980, by = "polyname")
print(disasters)
#>           polyname indAnyDisaster_frequency persinc_1980
#> 1  alabama,autauga                      1-2           NA
#> 2  alabama,autauga                      1-2           NA
#> 3  alabama,autauga                      1-2           NA
#> 4  alabama,autauga                      1-2           NA
#> 5  alabama,baldwin                        0           NA
#> 6  alabama,baldwin                        0           NA
#> 7  alabama,baldwin                        0           NA
#> 8  alabama,baldwin                        0           NA
#> 9  alabama,barbour                      3-5           25
#> 10 alabama,barbour                      3-5           30
#> 11 alabama,barbour                      3-5           25
#> 12 alabama,barbour                      3-5           30
#> 13    alabama,bibb                      1-2           32
#> 14    alabama,bibb                      1-2           28
#> 15    alabama,bibb                      1-2           32
#> 16    alabama,bibb                      1-2           28

# which missing
# Distinct county names whose joined income value is missing, in order of
# first appearance.
disasters %>%
  filter(is.na(persinc_1980)) %>%
  distinct(polyname) %>%
  pull(polyname)
#> [1] "alabama,autauga" "alabama,baldwin"

Created on 2022-10-26 with reprex v2.0.2