I have a table with prefixes (here in csv format):
PREFIX,LABEL
A,Infectious diseases
B,Infectious diseases
C,Tumor
D1,Tumor
D2,Tumor
D31,Tumor
D32,Tumor
D33,Blood disorder
D4,Blood disorder
D5,Blood disorder
And I want to join it with this one:
AGE,DEATH_CODE
67,A02
85,D318
75,C007 X
62,D338
To get obviously:
AGE,LABEL
67,Infectious diseases
85,Tumor
75,Tumor
62,Blood disorder
I know how to do that with SQL and LIKE
but not with tidyverse left_join
or base R.
Dput of data
Table 1: CIM_CODES
structure(list(PREFIX = c("A", "B", "C", "D1", "D2", "D31", "D32",
"D33", "D4", "D5"), LABEL = c("Infectious diseases", "Infectious diseases",
"Tumor", "Tumor", "Tumor", "Tumor", "Tumor", "Blood disorder",
"Blood disorder", "Blood disorder")), row.names = c(NA, -10L), spec = structure(list(
cols = list(PREFIX = structure(list(), class = c("collector_character",
"collector")), LABEL = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x000002527d306190>, class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
Table 2: DEATH_CAUSES
structure(list(AGE = c(67, 85, 75, 62), DEATH_CODE = c("A02",
"D318", "C007 X", "D338")), row.names = c(NA, -4L), spec = structure(list(
cols = list(AGE = structure(list(), class = c("collector_double",
"collector")), DEATH_CODE = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x0000025273898c60>, class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
CodePudding user response:
You could do a partial string match that has the lowest difference between the two columns:
library(tidyverse)
DEATH_CAUSES |>
mutate(LABEL = map_chr(DEATH_CODE,
~CIM_CODES$LABEL[
which.min(stringdist::stringdist(.x, CIM_CODES$PREFIX))
]))
#> # A tibble: 4 x 3
#> AGE DEATH_CODE LABEL
#> <dbl> <chr> <chr>
#> 1 67 A02 Infectious diseases
#> 2 85 D318 Tumor
#> 3 75 C007 X Tumor
#> 4 62 D338 Blood disorder
UPDATE
not using the stringdist
package as requested.
library(tidyverse)
get_match <- function(code, prefix, target){
map(code, \(x){
map(prefix, \(y){
grepl(paste0("^", y), x)
})
}) |>
map_chr(\(z) target[unlist(z) |> which()] )
}
DEATH_CAUSES |>
mutate(LABEL = get_match(DEATH_CAUSES$DEATH_CODE,
CIM_CODES$PREFIX,
CIM_CODES$LABEL))
#> # A tibble: 4 x 3
#> AGE DEATH_CODE LABEL
#> <dbl> <chr> <chr>
#> 1 67 A02 Infectious diseases
#> 2 85 D318 Tumor
#> 3 75 C007 X Tumor
#> 4 62 D338 Blood disorder
EDIT how to do this with a join:
library(tidyverse)
library(fuzzyjoin)
fuzzy_left_join(DEATH_CAUSES,
CIM_CODES,
by = c("DEATH_CODE" = "PREFIX"),
str_detect)
#> # A tibble: 4 x 4
#> AGE DEATH_CODE PREFIX LABEL
#> <dbl> <chr> <chr> <chr>
#> 1 67 A02 A Infectious diseases
#> 2 85 D318 D31 Tumor
#> 3 75 C007 X C Tumor
#> 4 62 D338 D33 Blood disorder
CodePudding user response:
My code below, I used mysql:
select a.age, p.label
from prefix p
left join age a on a.death_code like CONCAT("%",p.prefix,"%");
You can refer here: how to use a like with a join in sql?