I have data as follows:
dat <- structure(list(ZIP_source1 = c(1026, 1026, 1026, 1026, 1026,
1026, 1026, 1026, 1026, 1026, 1017, 1012, 1012), ZIP_source2 = c(1026,
1026, 1026, 1026, 1026, 1026, NA, NA, NA, NA, NA, 1012, 1012),
Category_source2 = c(4, 4, 4, 4, 4, 4, NA, NA, NA, NA, NA, 4, 4)), class = c("data.table",
"data.frame"), row.names = c(NA, -13L))
dat
ZIP_source1 ZIP_source2 Category_source2
1: 1016 1016 4
2: 1016 1016 4
3: 1016 1016 4
4: 1016 1016 4
5: 1016 1016 4
6: 1016 1016 4
7: 1016 NA NA
8: 1016 NA NA
9: 1016 NA NA
10: 1016 NA NA
11: 1027 NA NA
12: 1022 1022 4
13: 1022 1022 4
For line 7 to 10, I know from source 1 what the zip code is. From source 2 I know that this zip code falls in category 4. What is the best way to do this?
Desired output:
ZIP_source1 ZIP_source2 Category_source2
1: 1016 1016 4
2: 1016 1016 4
3: 1016 1016 4
4: 1016 1016 4
5: 1016 1016 4
6: 1016 1016 4
7: 1016 NA 4
8: 1016 NA 4
9: 1016 NA 4
10: 1016 NA 4
11: 1027 NA NA
12: 1022 1022 4
13: 1022 1022 4
CodePudding user response:
We can use fill
library(dplyr)
library(tidyr)
dat %>%
group_by(ZIP_source1) %>%
fill(Category_source2, .direction = "downup")
Or using nafill
library(data.table)
dat[, Category_source2 := nafill(nafill(Category_source2,
type = "locf"), type = "nocb"), ZIP_source1]
-output
> dat
ZIP_source1 ZIP_source2 Category_source2
<num> <num> <num>
1: 1026 1026 4
2: 1026 1026 4
3: 1026 1026 4
4: 1026 1026 4
5: 1026 1026 4
6: 1026 1026 4
7: 1026 NA 4
8: 1026 NA 4
9: 1026 NA 4
10: 1026 NA 4
11: 1017 NA NA
12: 1012 1012 4
13: 1012 1012 4
CodePudding user response:
I'd prefer to create new columns to do this, which I will call zip
and category
, but it's straightforward to overwrite the original columns if you want.
# Get all zips where not NA in one column
dat <- dat %>%
mutate(
zip = coalesce(ZIP_source1, ZIP_source2)
)
# Create table of all categories
category_table <- dat %>%
select(Category_source2, zip) %>%
drop_na() %>%
group_by(zip) %>%
distinct() %>%
rename(category = Category_source2)
category_table
# category zip
# <dbl> <dbl>
# 1 4 1026
# 2 4 1012
# Join as new column
left_join(dat, category_table, by = "zip")
# left_join(dat, category_table, by = "zip")
# ZIP_source1 ZIP_source2 Category_source2 zip category
# 1 1026 1026 4 1026 4
# 2 1026 1026 4 1026 4
# 3 1026 1026 4 1026 4
# 4 1026 1026 4 1026 4
# 5 1026 1026 4 1026 4
# 6 1026 1026 4 1026 4
# 7 1026 NA NA 1026 4
# 8 1026 NA NA 1026 4
# 9 1026 NA NA 1026 4
# 10 1026 NA NA 1026 4
# 11 1017 NA NA 1017 NA
# 12 1012 1012 4 1012 4
# 13 1012 1012 4 1012 4