I merged to datasets (dt, dt2) based on the exact match between company names in order to assign group_id to the second dataset (dt2). Now, I would like to fill the empty values of group_id with the ones from the same company (different branches) but having different names.
I constructed the data using this merge code
dt2 <- merge(dt2,dt[,c("psn_name_PAT","group_id")],by = c("psn_name_PAT"),all.x = T,all.y = F)
DATA
psn_name_PAT n name_std group_id
1 CHICHIBU FUJI COMPANY 2 CHICHIBU FUJI NA
2 FUJI CERAMICS 1 FUJI CERAMICS NA
3 FUJI CHEMICAL 1 FUJI 606
4 FUJI ELECTRIC COMPANY 439 FUJI 606
5 FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT 5 FUJI 606
6 FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT 12 FUJI 606
DESIRED Output:
psn_name_PAT n name_std group_id
1 CHICHIBU FUJI COMPANY 2 CHICHIBU FUJI 606
2 FUJI CERAMICS 1 FUJI CERAMICS 606
3 FUJI CHEMICAL 1 FUJI 606
4 FUJI ELECTRIC COMPANY 439 FUJI 606
5 FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT 5 FUJI 606
6 FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT 12 FUJI 606
I tried the following code as a way to match 'not-exact word' matches from another dataset (dt2), but unfortunately it brings different group_id not the once from the right company (i.e. FUJI):
dt2 <- dt2 %>%
mutate(group_id=ifelse(grepl(paste0("\\",fuz$name_std,"\\b", collapse = "|"), name_std),fuz$group_id,NA))
Any idea on how I could assign the right company group_id (i.e. FUJI) to different branches names?
Thank you in advance for your help!
DATASET
dt
structure(list(psn_name_PAT = c("FUJI CHEMICAL", "FUJI ELECTRIC COMPANY",
"FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT", "FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT",
"FUJI ELECTRIC CORPORATION RESEARCH AND DEVELOPMENT", "FUJI ELECTRIC SYSTEMS COMPANY",
"FUJI ELECTRIC TECHNOLOGY COMPANY", "FUJI MACHINE MANUFACTURING COMPANY",
"FUJITSU", "FUJITSU GENERAL", "FUJIMI", "FUJIFILM", "FUJIFILM ELECTRONIC MATERIALS U.S.A.",
"FUJIFILM MANUFACTURING EUROPE", "FUJIKURA", "FUJI MACHINERY MFG. & ELECTRONICS COMPANY",
"FUJI XEROX COMPANY", "FUJIKIN", "FUJIKOSHI MACHINERY CORPORATION",
"HONGFUJIN PRECISION INDUSTRY (SHENZHEN) COMPANY", "TOSHIBA AMERICA RESEARCH",
"TOSHIBA CORPORATION", "TOSHIBA MACHINE COMPANY", "TOSHIBA MATERIALS COMPANY",
"TOSHIBA MEDICAL SYSTEMS CORPORATION", "TOSHIBA SOLUTIONS CORPORATION"
), name_std = c("FUJI", "FUJI", "FUJI", "FUJI", "FUJI", "FUJI",
"FUJI", "FUJI", "FUJITSU", "FUJITSU", "FUJIMI", "FUJIFILM", "FUJIFILM",
"FUJIFILM", "FUJIKURA", "FUJI MFG", "FUJI XEROX", "FUJIKIN",
"FUJIKOSHI", "HONGFUJIN", "TOSHIBA", "TOSHIBA", "TOSHIBA", "TOSHIBA",
"TOSHIBA", "TOSHIBA"), n = c(45L, 45L, 45L, 45L, 45L, 45L, 45L,
45L, 6L, 6L, 4L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), group_id = c(606L, 606L, 606L, 606L, 606L, 606L,
606L, 606L, 614L, 614L, 613L, 609L, 609L, 609L, 612L, 607L, 608L,
610L, 611L, 705L, 1631L, 1631L, 1631L, 1631L, 1631L, 1631L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -26L))
dt2
structure(list(psn_name_PAT = c("CHICHIBU FUJI COMPANY", "FUJI CERAMICS",
"FUJI CHEMICAL", "FUJI ELECTRIC COMPANY", "FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT",
"FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT", "FUJI ELECTRIC CORPORATION RESEARCH AND DEVELOPMENT",
"FUJI ELECTRIC DEVICE TECHNOLOGY COMPANY", "FUJI ELECTRIC SYSTEMS COMPANY",
"FUJI ELECTRIC TECHNOLOGY COMPANY", "FUJI ELECTROCHEMICAL COMPANY",
"FUJI FILM MICRODEVICES COMPANY", "FUJI HEAVY IND", "FUJI MACHINE MANUFACTURING COMPANY",
"FUJI MACHINERY MFG. & ELECTRONICS COMPANY", "FUJI SEIKI MACHINE WORKS",
"FUJI XEROX COMPANY", "FUJIFILM", "FUJIFILM DIAMATIX", "FUJIFILM DIMATIX",
"FUJIFILM ELECTRONIC IMAGING", "FUJIFILM ELECTRONIC MATERIALS U.S.A.",
"FUJIFILM MANUFACTURING EUROPE", "FUJIKIN", "FUJIKOSHI KIKAI KOGYO",
"FUJIKOSHI MACHINERY CORPORATION", "FUJIKURA", "FUJIMA", "FUJIMI",
"FUJINON CORPORATION", "FUJITSU", "FUJITSU AMD SEMICONDUCTOR",
"FUJITSU AMD SEMICONDUCTOR LIMITED (FASL)", "FUJITSU AND SEMICONDUCTOR",
"FUJITSU AUTOMATION", "FUJITSU COMPONENT", "FUJITSU DISPLAY TECHNOLOGIES CORPORATION",
"FUJITSU FRONTECH", "FUJITSU GENERAL", "FUJITSU HITACHI PLASMA DISPLAY",
"FUJITSU LIMITED KABUSHIKI KAISHA TOSHIBA", "FUJITSU MEDIA DEVICES",
"FUJITSU MICROCOMPUTER SYSTEMS", "FUJITSU MICROELECTRONICS",
"FUJITSU MICROELECTRONICS LMIITED", "FUJITSU MIYAGI ELECTRONICS",
"FUJITSU QUANTUM DEVICES", "FUJITSU SEMICONDUCTOR", "FUJITSU TAKAMISAWA COMPONENT",
"FUJITSU TEN", "FUJITSU VLSI", "FUJITSU YAMANASHI ELECTRONICS",
"HONGFUJIN PRECISION INDUSTRY (SHENZHEN) COMPANY", "KYUSHU FUJITSU ELECTRONICS",
"NEC TOSHIBA SPACE SYSTEMS", "SUZUKA FUJI XEROX COMPANY", "TOSHIBA AMERICA ELECTRONIC COMPONENTS",
"TOSHIBA AMERICA RESEARCH", "TOSHIBA AUTOMATION COMPANY", "TOSHIBA AVE CORPORATION",
"TOSHIBA CERAMICS COMPANY", "TOSHIBA COMPONENTS COMPANY", "TOSHIBA CORPORATION",
"TOSHIBA KIKAI", "TOSHIBA LIGHTING & TECHNOLOGY CORPORATION",
"TOSHIBA MACHINE COMPANY", "TOSHIBA MATERIALS COMPANY", "TOSHIBA MATSUSHITA DISPLAY TECHNOLOGY COMPANY",
"TOSHIBA MEDICAL SYSTEMS CORPORATION", "TOSHIBA MICROELECTRONICS CORPORATION",
"TOSHIBA MOBILE DISPLAY COMPANY", "TOSHIBA SHIBAURA DENKI", "TOSHIBA SILICONE COMPANY",
"TOSHIBA SOLUTIONS CORPORATION", "TOSHIBA TEC CORPORATION", "TOSHIBA TECHNO CENTER"
), n = c(2L, 1L, 1L, 439L, 5L, 12L, 1L, 43L, 38L, 1L, 1L, 1L,
1L, 2L, 2L, 4L, 129L, 549L, 1L, 5L, 1L, 1L, 2L, 5L, 1L, 1L, 35L,
1L, 7L, 20L, 2430L, 7L, 1L, 2L, 1L, 2L, 4L, 2L, 1L, 1L, 1L, 36L,
1L, 126L, 1L, 1L, 56L, 312L, 1L, 3L, 25L, 1L, 6L, 13L, 1L, 1L,
14L, 1L, 1L, 1L, 19L, 1L, 5609L, 7L, 5L, 5L, 15L, 22L, 8L, 1L,
2L, 1L, 1L, 1L, 3L, 4L), name_std = c("CHICHIBU FUJI", "FUJI CERAMICS",
"FUJI", "FUJI", "FUJI", "FUJI", "FUJI", "FUJI DEVICE", "FUJI",
"FUJI", "FUJI ELECTROCHEMICAL", "FUJI MICRODEVICES", "FUJI HEAVY IND",
"FUJI", "FUJI MFG", "FUJI SEIKI WORKS", "FUJI XEROX", "FUJIFILM",
"FUJIFILM DIAMATIX", "FUJIFILM DIMATIX", "FUJIFILM IMAGING",
"FUJIFILM", "FUJIFILM", "FUJIKIN", "FUJIKOSHI KIKAI", "FUJIKOSHI",
"FUJIKURA", "FUJIMA", "FUJIMI", "FUJINON", "FUJITSU", "FUJITSU AMD SEMICONDUCTOR",
"FUJITSU AMD SEMICONDUCTOR FASL", "FUJITSU SEMICONDUCTOR", "FUJITSU AUTOMATION",
"FUJITSU COMPONENT", "FUJITSU DISPLAY", "FUJITSU FRONTECH", "FUJITSU",
"FUJITSU HITACHI PLASMA DISPLAY", "FUJITSU KABUSHIKI KAISHA TOSHIBA",
"FUJITSU DEVICES", "FUJITSU MICROCOMPUTER", "FUJITSU MICROELECTRONICS",
"FUJITSU MICROELECTRONICS LMIITED", "FUJITSU MIYAGI", "FUJITSU QUANTUM DEVICES",
"FUJITSU SEMICONDUCTOR", "FUJITSU TAKAMISAWA COMPONENT", "FUJITSU TEN",
"FUJITSU VLSI", "FUJITSU YAMANASHI", "HONGFUJIN", "KYUSHU FUJITSU",
"NEC TOSHIBA SPACE", "SUZUKA FUJI XEROX", "TOSHIBA COMPONENTS",
"TOSHIBA", "TOSHIBA AUTOMATION", "TOSHIBA AVE", "TOSHIBA CERAMICS",
"TOSHIBA COMPONENTS", "TOSHIBA", "TOSHIBA KIKAI", "TOSHIBA LIGHTING",
"TOSHIBA", "TOSHIBA", "TOSHIBA MATSUSHITA DISPLAY", "TOSHIBA",
"TOSHIBA MICROELECTRONICS", "TOSHIBA MOBILE DISPLAY", "TOSHIBA SHIBAURA DENKI",
"TOSHIBA SILICONE", "TOSHIBA", "TOSHIBA TEC", "TOSHIBA TECHNO"
), group_id = c(NA, NA, 606L, 606L, 606L, 606L, 606L, NA, 606L,
606L, NA, NA, NA, 606L, 607L, NA, 608L, 609L, NA, NA, NA, 609L,
609L, 610L, NA, 611L, 612L, NA, 613L, NA, 614L, NA, NA, NA, NA,
NA, NA, NA, 614L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 705L, NA, NA, NA, NA, 1631L, NA, NA, NA, NA, 1631L, NA,
NA, 1631L, 1631L, NA, 1631L, NA, NA, NA, NA, 1631L, NA, NA)), row.names = c(NA,
-76L), class = "data.frame")
CodePudding user response:
Allthough I do not exactly grasp the kind of granulation of your join, I believe that you need this kind of code:
The only thing you have to define is the max_dist
:
With max_dist
we could define the Maximum distance to use for joining
See: ?stringdist_left_join
library(dplyr)
library(fuzzyjoin)
library(tidyr)
fuzzyjoin::stringdist_full_join(x=dt, y=dt2, max_dist = .70,
by='name_std',
method = 'jaccard',
distance_col = "dist") %>%
group_by(name_std.x) %>% # here you may want to group by psn_name_PAT.x
fill(group_id.y, .direction = "updown") %>%
select(psn_name_PAT = psn_name_PAT.y, n=n.y, group_id=group_id.y)
name_std.x psn_name_PAT n group_id
<chr> <chr> <int> <int>
1 FUJI CHICHIBU FUJI COMPANY 2 606
2 FUJI FUJI CERAMICS 1 606
3 FUJI FUJI CHEMICAL 1 606
4 FUJI FUJI ELECTRIC COMPANY 439 606
5 FUJI FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT 5 606
6 FUJI FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT 12 606
7 FUJI FUJI ELECTRIC CORPORATION RESEARCH AND DEVELOPMENT 1 606
8 FUJI FUJI ELECTRIC DEVICE TECHNOLOGY COMPANY 43 606
9 FUJI FUJI ELECTRIC SYSTEMS COMPANY 38 606
10 FUJI FUJI ELECTRIC TECHNOLOGY COMPANY 1 606
# ... with 1,182 more rows
CodePudding user response:
library(dplyr)
library(stringr)
library(tibble)
# getting the company name from dt for which we have group id
company <- dt %>% as_tibble() %>%
distinct(group_id, .keep_all = TRUE) %>%
select(name_std, group_id)
# creating the pattern for matching
company_name <- company %>%
arrange(desc(name_std)) %>%
pull(name_std) %>%
str_c(collapse = "|") %>%
paste0("\\b",., "\\b")
dt2 <- dt2 %>%
as_tibble() %>%
mutate(
company_name = str_extract(name_std, company_name)
) %>%
left_join(company, by = c("company_name" = "name_std")) %>%
select(company_name, name_std, psn_name_PAT, group_id = group_id.y) %>%
arrange(group_id)
print(dt2, n = Inf)
#> # A tibble: 76 × 4
#> company_name name_std psn_name_PAT group_id
#> <chr> <chr> <chr> <int>
#> 1 FUJI CHICHIBU FUJI CHICHIBU FUJI COMPANY 606
#> 2 FUJI FUJI CERAMICS FUJI CERAMICS 606
#> 3 FUJI FUJI FUJI CHEMICAL 606
#> 4 FUJI FUJI FUJI ELECTRIC COMPANY 606
#> 5 FUJI FUJI FUJI ELECTRIC CORPORA… 606
#> 6 FUJI FUJI FUJI ELECTRIC CORPORA… 606
#> 7 FUJI FUJI FUJI ELECTRIC CORPORA… 606
#> 8 FUJI FUJI DEVICE FUJI ELECTRIC DEVICE … 606
#> 9 FUJI FUJI FUJI ELECTRIC SYSTEMS… 606
#> 10 FUJI FUJI FUJI ELECTRIC TECHNOL… 606
#> 11 FUJI FUJI ELECTROCHEMICAL FUJI ELECTROCHEMICAL … 606
#> 12 FUJI FUJI MICRODEVICES FUJI FILM MICRODEVICE… 606
#> 13 FUJI FUJI HEAVY IND FUJI HEAVY IND 606
#> 14 FUJI FUJI FUJI MACHINE MANUFACT… 606
#> 15 FUJI FUJI SEIKI WORKS FUJI SEIKI MACHINE WO… 606
#> 16 FUJI MFG FUJI MFG FUJI MACHINERY MFG. &… 607
#> 17 FUJI XEROX FUJI XEROX FUJI XEROX COMPANY 608
#> 18 FUJI XEROX SUZUKA FUJI XEROX SUZUKA FUJI XEROX COM… 608
#> 19 FUJIFILM FUJIFILM FUJIFILM 609
#> 20 FUJIFILM FUJIFILM DIAMATIX FUJIFILM DIAMATIX 609
#> 21 FUJIFILM FUJIFILM DIMATIX FUJIFILM DIMATIX 609
#> 22 FUJIFILM FUJIFILM IMAGING FUJIFILM ELECTRONIC I… 609
#> 23 FUJIFILM FUJIFILM FUJIFILM ELECTRONIC M… 609
#> 24 FUJIFILM FUJIFILM FUJIFILM MANUFACTURIN… 609
#> 25 FUJIKIN FUJIKIN FUJIKIN 610
#> 26 FUJIKOSHI FUJIKOSHI KIKAI FUJIKOSHI KIKAI KOGYO 611
#> 27 FUJIKOSHI FUJIKOSHI FUJIKOSHI MACHINERY C… 611
#> 28 FUJIKURA FUJIKURA FUJIKURA 612
#> 29 FUJIMI FUJIMI FUJIMI 613
#> 30 FUJITSU FUJITSU FUJITSU 614
#> 31 FUJITSU FUJITSU AMD SEMICONDUCTOR FUJITSU AMD SEMICONDU… 614
#> 32 FUJITSU FUJITSU AMD SEMICONDUCTOR FASL FUJITSU AMD SEMICONDU… 614
#> 33 FUJITSU FUJITSU SEMICONDUCTOR FUJITSU AND SEMICONDU… 614
#> 34 FUJITSU FUJITSU AUTOMATION FUJITSU AUTOMATION 614
#> 35 FUJITSU FUJITSU COMPONENT FUJITSU COMPONENT 614
#> 36 FUJITSU FUJITSU DISPLAY FUJITSU DISPLAY TECHN… 614
#> 37 FUJITSU FUJITSU FRONTECH FUJITSU FRONTECH 614
#> 38 FUJITSU FUJITSU FUJITSU GENERAL 614
#> 39 FUJITSU FUJITSU HITACHI PLASMA DISPLAY FUJITSU HITACHI PLASM… 614
#> 40 FUJITSU FUJITSU KABUSHIKI KAISHA TOSHIBA FUJITSU LIMITED KABUS… 614
#> 41 FUJITSU FUJITSU DEVICES FUJITSU MEDIA DEVICES 614
#> 42 FUJITSU FUJITSU MICROCOMPUTER FUJITSU MICROCOMPUTER… 614
#> 43 FUJITSU FUJITSU MICROELECTRONICS FUJITSU MICROELECTRON… 614
#> 44 FUJITSU FUJITSU MICROELECTRONICS LMIITED FUJITSU MICROELECTRON… 614
#> 45 FUJITSU FUJITSU MIYAGI FUJITSU MIYAGI ELECTR… 614
#> 46 FUJITSU FUJITSU QUANTUM DEVICES FUJITSU QUANTUM DEVIC… 614
#> 47 FUJITSU FUJITSU SEMICONDUCTOR FUJITSU SEMICONDUCTOR 614
#> 48 FUJITSU FUJITSU TAKAMISAWA COMPONENT FUJITSU TAKAMISAWA CO… 614
#> 49 FUJITSU FUJITSU TEN FUJITSU TEN 614
#> 50 FUJITSU FUJITSU VLSI FUJITSU VLSI 614
#> 51 FUJITSU FUJITSU YAMANASHI FUJITSU YAMANASHI ELE… 614
#> 52 FUJITSU KYUSHU FUJITSU KYUSHU FUJITSU ELECTR… 614
#> 53 HONGFUJIN HONGFUJIN HONGFUJIN PRECISION I… 705
#> 54 TOSHIBA NEC TOSHIBA SPACE NEC TOSHIBA SPACE SYS… 1631
#> 55 TOSHIBA TOSHIBA COMPONENTS TOSHIBA AMERICA ELECT… 1631
#> 56 TOSHIBA TOSHIBA TOSHIBA AMERICA RESEA… 1631
#> 57 TOSHIBA TOSHIBA AUTOMATION TOSHIBA AUTOMATION CO… 1631
#> 58 TOSHIBA TOSHIBA AVE TOSHIBA AVE CORPORATI… 1631
#> 59 TOSHIBA TOSHIBA CERAMICS TOSHIBA CERAMICS COMP… 1631
#> 60 TOSHIBA TOSHIBA COMPONENTS TOSHIBA COMPONENTS CO… 1631
#> 61 TOSHIBA TOSHIBA TOSHIBA CORPORATION 1631
#> 62 TOSHIBA TOSHIBA KIKAI TOSHIBA KIKAI 1631
#> 63 TOSHIBA TOSHIBA LIGHTING TOSHIBA LIGHTING & TE… 1631
#> 64 TOSHIBA TOSHIBA TOSHIBA MACHINE COMPA… 1631
#> 65 TOSHIBA TOSHIBA TOSHIBA MATERIALS COM… 1631
#> 66 TOSHIBA TOSHIBA MATSUSHITA DISPLAY TOSHIBA MATSUSHITA DI… 1631
#> 67 TOSHIBA TOSHIBA TOSHIBA MEDICAL SYSTE… 1631
#> 68 TOSHIBA TOSHIBA MICROELECTRONICS TOSHIBA MICROELECTRON… 1631
#> 69 TOSHIBA TOSHIBA MOBILE DISPLAY TOSHIBA MOBILE DISPLA… 1631
#> 70 TOSHIBA TOSHIBA SHIBAURA DENKI TOSHIBA SHIBAURA DENKI 1631
#> 71 TOSHIBA TOSHIBA SILICONE TOSHIBA SILICONE COMP… 1631
#> 72 TOSHIBA TOSHIBA TOSHIBA SOLUTIONS COR… 1631
#> 73 TOSHIBA TOSHIBA TEC TOSHIBA TEC CORPORATI… 1631
#> 74 TOSHIBA TOSHIBA TECHNO TOSHIBA TECHNO CENTER 1631
#> 75 <NA> FUJIMA FUJIMA NA
#> 76 <NA> FUJINON FUJINON CORPORATION NA
Created on 2022-07-11 by the reprex package (v2.0.1)