Home > database >  Assign similar company names to same group id
Assign similar company names to same group id

Time:07-11

I merged to datasets (dt, dt2) based on the exact match between company names in order to assign group_id to the second dataset (dt2). Now, I would like to fill the empty values of group_id with the ones from the same company (different branches) but having different names.

I constructed the data using this merge code

dt2 <- merge(dt2,dt[,c("psn_name_PAT","group_id")],by = c("psn_name_PAT"),all.x = T,all.y = F)

DATA

                                      psn_name_PAT   n      name_std group_id
1                            CHICHIBU FUJI COMPANY   2 CHICHIBU FUJI       NA
2                                    FUJI CERAMICS   1 FUJI CERAMICS       NA
3                                    FUJI CHEMICAL   1          FUJI      606
4                            FUJI ELECTRIC COMPANY 439          FUJI      606
5   FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT   5          FUJI      606
6 FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT  12          FUJI      606

DESIRED Output:

                                      psn_name_PAT   n      name_std group_id
1                            CHICHIBU FUJI COMPANY   2 CHICHIBU FUJI      606
2                                    FUJI CERAMICS   1 FUJI CERAMICS      606
3                                    FUJI CHEMICAL   1          FUJI      606
4                            FUJI ELECTRIC COMPANY 439          FUJI      606
5   FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT   5          FUJI      606
6 FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT  12          FUJI      606

I tried the following code as a way to match 'not-exact word' matches from another dataset (dt2), but unfortunately it brings different group_id not the once from the right company (i.e. FUJI):

dt2 <- dt2 %>% 
  mutate(group_id=ifelse(grepl(paste0("\\",fuz$name_std,"\\b", collapse = "|"), name_std),fuz$group_id,NA))

Any idea on how I could assign the right company group_id (i.e. FUJI) to different branches names?

Thank you in advance for your help!


DATASET

dt

    structure(list(psn_name_PAT = c("FUJI CHEMICAL", "FUJI ELECTRIC COMPANY", 
"FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT", "FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT", 
"FUJI ELECTRIC CORPORATION RESEARCH AND DEVELOPMENT", "FUJI ELECTRIC SYSTEMS COMPANY", 
"FUJI ELECTRIC TECHNOLOGY COMPANY", "FUJI MACHINE MANUFACTURING COMPANY", 
"FUJITSU", "FUJITSU GENERAL", "FUJIMI", "FUJIFILM", "FUJIFILM ELECTRONIC MATERIALS U.S.A.", 
"FUJIFILM MANUFACTURING EUROPE", "FUJIKURA", "FUJI MACHINERY MFG. & ELECTRONICS COMPANY", 
"FUJI XEROX COMPANY", "FUJIKIN", "FUJIKOSHI MACHINERY CORPORATION", 
"HONGFUJIN PRECISION INDUSTRY (SHENZHEN) COMPANY", "TOSHIBA AMERICA RESEARCH", 
"TOSHIBA CORPORATION", "TOSHIBA MACHINE COMPANY", "TOSHIBA MATERIALS COMPANY", 
"TOSHIBA MEDICAL SYSTEMS CORPORATION", "TOSHIBA SOLUTIONS CORPORATION"
), name_std = c("FUJI", "FUJI", "FUJI", "FUJI", "FUJI", "FUJI", 
"FUJI", "FUJI", "FUJITSU", "FUJITSU", "FUJIMI", "FUJIFILM", "FUJIFILM", 
"FUJIFILM", "FUJIKURA", "FUJI MFG", "FUJI XEROX", "FUJIKIN", 
"FUJIKOSHI", "HONGFUJIN", "TOSHIBA", "TOSHIBA", "TOSHIBA", "TOSHIBA", 
"TOSHIBA", "TOSHIBA"), n = c(45L, 45L, 45L, 45L, 45L, 45L, 45L, 
45L, 6L, 6L, 4L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), group_id = c(606L, 606L, 606L, 606L, 606L, 606L, 
606L, 606L, 614L, 614L, 613L, 609L, 609L, 609L, 612L, 607L, 608L, 
610L, 611L, 705L, 1631L, 1631L, 1631L, 1631L, 1631L, 1631L)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -26L))

dt2

    structure(list(psn_name_PAT = c("CHICHIBU FUJI COMPANY", "FUJI CERAMICS", 
"FUJI CHEMICAL", "FUJI ELECTRIC COMPANY", "FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT", 
"FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT", "FUJI ELECTRIC CORPORATION RESEARCH AND DEVELOPMENT", 
"FUJI ELECTRIC DEVICE TECHNOLOGY COMPANY", "FUJI ELECTRIC SYSTEMS COMPANY", 
"FUJI ELECTRIC TECHNOLOGY COMPANY", "FUJI ELECTROCHEMICAL COMPANY", 
"FUJI FILM MICRODEVICES COMPANY", "FUJI HEAVY IND", "FUJI MACHINE MANUFACTURING COMPANY", 
"FUJI MACHINERY MFG. & ELECTRONICS COMPANY", "FUJI SEIKI MACHINE WORKS", 
"FUJI XEROX COMPANY", "FUJIFILM", "FUJIFILM DIAMATIX", "FUJIFILM DIMATIX", 
"FUJIFILM ELECTRONIC IMAGING", "FUJIFILM ELECTRONIC MATERIALS U.S.A.", 
"FUJIFILM MANUFACTURING EUROPE", "FUJIKIN", "FUJIKOSHI KIKAI KOGYO", 
"FUJIKOSHI MACHINERY CORPORATION", "FUJIKURA", "FUJIMA", "FUJIMI", 
"FUJINON CORPORATION", "FUJITSU", "FUJITSU AMD SEMICONDUCTOR", 
"FUJITSU AMD SEMICONDUCTOR LIMITED (FASL)", "FUJITSU AND SEMICONDUCTOR", 
"FUJITSU AUTOMATION", "FUJITSU COMPONENT", "FUJITSU DISPLAY TECHNOLOGIES CORPORATION", 
"FUJITSU FRONTECH", "FUJITSU GENERAL", "FUJITSU HITACHI PLASMA DISPLAY", 
"FUJITSU LIMITED KABUSHIKI KAISHA TOSHIBA", "FUJITSU MEDIA DEVICES", 
"FUJITSU MICROCOMPUTER SYSTEMS", "FUJITSU MICROELECTRONICS", 
"FUJITSU MICROELECTRONICS LMIITED", "FUJITSU MIYAGI ELECTRONICS", 
"FUJITSU QUANTUM DEVICES", "FUJITSU SEMICONDUCTOR", "FUJITSU TAKAMISAWA COMPONENT", 
"FUJITSU TEN", "FUJITSU VLSI", "FUJITSU YAMANASHI ELECTRONICS", 
"HONGFUJIN PRECISION INDUSTRY (SHENZHEN) COMPANY", "KYUSHU FUJITSU ELECTRONICS", 
"NEC TOSHIBA SPACE SYSTEMS", "SUZUKA FUJI XEROX COMPANY", "TOSHIBA AMERICA ELECTRONIC COMPONENTS", 
"TOSHIBA AMERICA RESEARCH", "TOSHIBA AUTOMATION COMPANY", "TOSHIBA AVE CORPORATION", 
"TOSHIBA CERAMICS COMPANY", "TOSHIBA COMPONENTS COMPANY", "TOSHIBA CORPORATION", 
"TOSHIBA KIKAI", "TOSHIBA LIGHTING & TECHNOLOGY CORPORATION", 
"TOSHIBA MACHINE COMPANY", "TOSHIBA MATERIALS COMPANY", "TOSHIBA MATSUSHITA DISPLAY TECHNOLOGY COMPANY", 
"TOSHIBA MEDICAL SYSTEMS CORPORATION", "TOSHIBA MICROELECTRONICS CORPORATION", 
"TOSHIBA MOBILE DISPLAY COMPANY", "TOSHIBA SHIBAURA DENKI", "TOSHIBA SILICONE COMPANY", 
"TOSHIBA SOLUTIONS CORPORATION", "TOSHIBA TEC CORPORATION", "TOSHIBA TECHNO CENTER"
), n = c(2L, 1L, 1L, 439L, 5L, 12L, 1L, 43L, 38L, 1L, 1L, 1L, 
1L, 2L, 2L, 4L, 129L, 549L, 1L, 5L, 1L, 1L, 2L, 5L, 1L, 1L, 35L, 
1L, 7L, 20L, 2430L, 7L, 1L, 2L, 1L, 2L, 4L, 2L, 1L, 1L, 1L, 36L, 
1L, 126L, 1L, 1L, 56L, 312L, 1L, 3L, 25L, 1L, 6L, 13L, 1L, 1L, 
14L, 1L, 1L, 1L, 19L, 1L, 5609L, 7L, 5L, 5L, 15L, 22L, 8L, 1L, 
2L, 1L, 1L, 1L, 3L, 4L), name_std = c("CHICHIBU FUJI", "FUJI CERAMICS", 
"FUJI", "FUJI", "FUJI", "FUJI", "FUJI", "FUJI DEVICE", "FUJI", 
"FUJI", "FUJI ELECTROCHEMICAL", "FUJI MICRODEVICES", "FUJI HEAVY IND", 
"FUJI", "FUJI MFG", "FUJI SEIKI WORKS", "FUJI XEROX", "FUJIFILM", 
"FUJIFILM DIAMATIX", "FUJIFILM DIMATIX", "FUJIFILM IMAGING", 
"FUJIFILM", "FUJIFILM", "FUJIKIN", "FUJIKOSHI KIKAI", "FUJIKOSHI", 
"FUJIKURA", "FUJIMA", "FUJIMI", "FUJINON", "FUJITSU", "FUJITSU AMD SEMICONDUCTOR", 
"FUJITSU AMD SEMICONDUCTOR FASL", "FUJITSU SEMICONDUCTOR", "FUJITSU AUTOMATION", 
"FUJITSU COMPONENT", "FUJITSU DISPLAY", "FUJITSU FRONTECH", "FUJITSU", 
"FUJITSU HITACHI PLASMA DISPLAY", "FUJITSU KABUSHIKI KAISHA TOSHIBA", 
"FUJITSU DEVICES", "FUJITSU MICROCOMPUTER", "FUJITSU MICROELECTRONICS", 
"FUJITSU MICROELECTRONICS LMIITED", "FUJITSU MIYAGI", "FUJITSU QUANTUM DEVICES", 
"FUJITSU SEMICONDUCTOR", "FUJITSU TAKAMISAWA COMPONENT", "FUJITSU TEN", 
"FUJITSU VLSI", "FUJITSU YAMANASHI", "HONGFUJIN", "KYUSHU FUJITSU", 
"NEC TOSHIBA SPACE", "SUZUKA FUJI XEROX", "TOSHIBA COMPONENTS", 
"TOSHIBA", "TOSHIBA AUTOMATION", "TOSHIBA AVE", "TOSHIBA CERAMICS", 
"TOSHIBA COMPONENTS", "TOSHIBA", "TOSHIBA KIKAI", "TOSHIBA LIGHTING", 
"TOSHIBA", "TOSHIBA", "TOSHIBA MATSUSHITA DISPLAY", "TOSHIBA", 
"TOSHIBA MICROELECTRONICS", "TOSHIBA MOBILE DISPLAY", "TOSHIBA SHIBAURA DENKI", 
"TOSHIBA SILICONE", "TOSHIBA", "TOSHIBA TEC", "TOSHIBA TECHNO"
), group_id = c(NA, NA, 606L, 606L, 606L, 606L, 606L, NA, 606L, 
606L, NA, NA, NA, 606L, 607L, NA, 608L, 609L, NA, NA, NA, 609L, 
609L, 610L, NA, 611L, 612L, NA, 613L, NA, 614L, NA, NA, NA, NA, 
NA, NA, NA, 614L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, 705L, NA, NA, NA, NA, 1631L, NA, NA, NA, NA, 1631L, NA, 
NA, 1631L, 1631L, NA, 1631L, NA, NA, NA, NA, 1631L, NA, NA)), row.names = c(NA, 
-76L), class = "data.frame")

CodePudding user response:

Allthough I do not exactly grasp the kind of granulation of your join, I believe that you need this kind of code:

The only thing you have to define is the max_dist:

With max_dist we could define the Maximum distance to use for joining

See: ?stringdist_left_join

library(dplyr)
library(fuzzyjoin)
library(tidyr)

fuzzyjoin::stringdist_full_join(x=dt, y=dt2, max_dist = .70, 
                                by='name_std', 
                                method = 'jaccard', 
                                distance_col = "dist") %>% 
  group_by(name_std.x) %>% # here you may want to group by psn_name_PAT.x
  fill(group_id.y, .direction = "updown") %>% 
  select(psn_name_PAT = psn_name_PAT.y, n=n.y, group_id=group_id.y)
    name_std.x psn_name_PAT                                           n group_id
   <chr>      <chr>                                              <int>    <int>
 1 FUJI       CHICHIBU FUJI COMPANY                                  2      606
 2 FUJI       FUJI CERAMICS                                          1      606
 3 FUJI       FUJI CHEMICAL                                          1      606
 4 FUJI       FUJI ELECTRIC COMPANY                                439      606
 5 FUJI       FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT         5      606
 6 FUJI       FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT      12      606
 7 FUJI       FUJI ELECTRIC CORPORATION RESEARCH AND DEVELOPMENT     1      606
 8 FUJI       FUJI ELECTRIC DEVICE TECHNOLOGY COMPANY               43      606
 9 FUJI       FUJI ELECTRIC SYSTEMS COMPANY                         38      606
10 FUJI       FUJI ELECTRIC TECHNOLOGY COMPANY                       1      606
# ... with 1,182 more rows

CodePudding user response:


library(dplyr)
library(stringr)
library(tibble)


# getting the company name from dt for which we have group id 
company <- dt %>% as_tibble() %>% 
  distinct(group_id, .keep_all = TRUE) %>% 
  select(name_std, group_id)

# creating the pattern for matching 
company_name <- company %>% 
  arrange(desc(name_std)) %>% 
  pull(name_std) %>% 
  str_c(collapse = "|") %>% 
  paste0("\\b",., "\\b")


dt2 <- dt2 %>% 
  as_tibble() %>% 
  mutate(
    company_name = str_extract(name_std, company_name)
  ) %>% 
  left_join(company, by = c("company_name" = "name_std")) %>% 
  select(company_name, name_std, psn_name_PAT, group_id = group_id.y) %>% 
  arrange(group_id)

print(dt2, n = Inf)

#> # A tibble: 76 × 4
#>    company_name name_std                         psn_name_PAT           group_id
#>    <chr>        <chr>                            <chr>                     <int>
#>  1 FUJI         CHICHIBU FUJI                    CHICHIBU FUJI COMPANY       606
#>  2 FUJI         FUJI CERAMICS                    FUJI CERAMICS               606
#>  3 FUJI         FUJI                             FUJI CHEMICAL               606
#>  4 FUJI         FUJI                             FUJI ELECTRIC COMPANY       606
#>  5 FUJI         FUJI                             FUJI ELECTRIC CORPORA…      606
#>  6 FUJI         FUJI                             FUJI ELECTRIC CORPORA…      606
#>  7 FUJI         FUJI                             FUJI ELECTRIC CORPORA…      606
#>  8 FUJI         FUJI DEVICE                      FUJI ELECTRIC DEVICE …      606
#>  9 FUJI         FUJI                             FUJI ELECTRIC SYSTEMS…      606
#> 10 FUJI         FUJI                             FUJI ELECTRIC TECHNOL…      606
#> 11 FUJI         FUJI ELECTROCHEMICAL             FUJI ELECTROCHEMICAL …      606
#> 12 FUJI         FUJI MICRODEVICES                FUJI FILM MICRODEVICE…      606
#> 13 FUJI         FUJI HEAVY IND                   FUJI HEAVY IND              606
#> 14 FUJI         FUJI                             FUJI MACHINE MANUFACT…      606
#> 15 FUJI         FUJI SEIKI WORKS                 FUJI SEIKI MACHINE WO…      606
#> 16 FUJI MFG     FUJI MFG                         FUJI MACHINERY MFG. &…      607
#> 17 FUJI XEROX   FUJI XEROX                       FUJI XEROX COMPANY          608
#> 18 FUJI XEROX   SUZUKA FUJI XEROX                SUZUKA FUJI XEROX COM…      608
#> 19 FUJIFILM     FUJIFILM                         FUJIFILM                    609
#> 20 FUJIFILM     FUJIFILM DIAMATIX                FUJIFILM DIAMATIX           609
#> 21 FUJIFILM     FUJIFILM DIMATIX                 FUJIFILM DIMATIX            609
#> 22 FUJIFILM     FUJIFILM IMAGING                 FUJIFILM ELECTRONIC I…      609
#> 23 FUJIFILM     FUJIFILM                         FUJIFILM ELECTRONIC M…      609
#> 24 FUJIFILM     FUJIFILM                         FUJIFILM MANUFACTURIN…      609
#> 25 FUJIKIN      FUJIKIN                          FUJIKIN                     610
#> 26 FUJIKOSHI    FUJIKOSHI KIKAI                  FUJIKOSHI KIKAI KOGYO       611
#> 27 FUJIKOSHI    FUJIKOSHI                        FUJIKOSHI MACHINERY C…      611
#> 28 FUJIKURA     FUJIKURA                         FUJIKURA                    612
#> 29 FUJIMI       FUJIMI                           FUJIMI                      613
#> 30 FUJITSU      FUJITSU                          FUJITSU                     614
#> 31 FUJITSU      FUJITSU AMD SEMICONDUCTOR        FUJITSU AMD SEMICONDU…      614
#> 32 FUJITSU      FUJITSU AMD SEMICONDUCTOR FASL   FUJITSU AMD SEMICONDU…      614
#> 33 FUJITSU      FUJITSU SEMICONDUCTOR            FUJITSU AND SEMICONDU…      614
#> 34 FUJITSU      FUJITSU AUTOMATION               FUJITSU AUTOMATION          614
#> 35 FUJITSU      FUJITSU COMPONENT                FUJITSU COMPONENT           614
#> 36 FUJITSU      FUJITSU DISPLAY                  FUJITSU DISPLAY TECHN…      614
#> 37 FUJITSU      FUJITSU FRONTECH                 FUJITSU FRONTECH            614
#> 38 FUJITSU      FUJITSU                          FUJITSU GENERAL             614
#> 39 FUJITSU      FUJITSU HITACHI PLASMA DISPLAY   FUJITSU HITACHI PLASM…      614
#> 40 FUJITSU      FUJITSU KABUSHIKI KAISHA TOSHIBA FUJITSU LIMITED KABUS…      614
#> 41 FUJITSU      FUJITSU DEVICES                  FUJITSU MEDIA DEVICES       614
#> 42 FUJITSU      FUJITSU MICROCOMPUTER            FUJITSU MICROCOMPUTER…      614
#> 43 FUJITSU      FUJITSU MICROELECTRONICS         FUJITSU MICROELECTRON…      614
#> 44 FUJITSU      FUJITSU MICROELECTRONICS LMIITED FUJITSU MICROELECTRON…      614
#> 45 FUJITSU      FUJITSU MIYAGI                   FUJITSU MIYAGI ELECTR…      614
#> 46 FUJITSU      FUJITSU QUANTUM DEVICES          FUJITSU QUANTUM DEVIC…      614
#> 47 FUJITSU      FUJITSU SEMICONDUCTOR            FUJITSU SEMICONDUCTOR       614
#> 48 FUJITSU      FUJITSU TAKAMISAWA COMPONENT     FUJITSU TAKAMISAWA CO…      614
#> 49 FUJITSU      FUJITSU TEN                      FUJITSU TEN                 614
#> 50 FUJITSU      FUJITSU VLSI                     FUJITSU VLSI                614
#> 51 FUJITSU      FUJITSU YAMANASHI                FUJITSU YAMANASHI ELE…      614
#> 52 FUJITSU      KYUSHU FUJITSU                   KYUSHU FUJITSU ELECTR…      614
#> 53 HONGFUJIN    HONGFUJIN                        HONGFUJIN PRECISION I…      705
#> 54 TOSHIBA      NEC TOSHIBA SPACE                NEC TOSHIBA SPACE SYS…     1631
#> 55 TOSHIBA      TOSHIBA COMPONENTS               TOSHIBA AMERICA ELECT…     1631
#> 56 TOSHIBA      TOSHIBA                          TOSHIBA AMERICA RESEA…     1631
#> 57 TOSHIBA      TOSHIBA AUTOMATION               TOSHIBA AUTOMATION CO…     1631
#> 58 TOSHIBA      TOSHIBA AVE                      TOSHIBA AVE CORPORATI…     1631
#> 59 TOSHIBA      TOSHIBA CERAMICS                 TOSHIBA CERAMICS COMP…     1631
#> 60 TOSHIBA      TOSHIBA COMPONENTS               TOSHIBA COMPONENTS CO…     1631
#> 61 TOSHIBA      TOSHIBA                          TOSHIBA CORPORATION        1631
#> 62 TOSHIBA      TOSHIBA KIKAI                    TOSHIBA KIKAI              1631
#> 63 TOSHIBA      TOSHIBA LIGHTING                 TOSHIBA LIGHTING & TE…     1631
#> 64 TOSHIBA      TOSHIBA                          TOSHIBA MACHINE COMPA…     1631
#> 65 TOSHIBA      TOSHIBA                          TOSHIBA MATERIALS COM…     1631
#> 66 TOSHIBA      TOSHIBA MATSUSHITA DISPLAY       TOSHIBA MATSUSHITA DI…     1631
#> 67 TOSHIBA      TOSHIBA                          TOSHIBA MEDICAL SYSTE…     1631
#> 68 TOSHIBA      TOSHIBA MICROELECTRONICS         TOSHIBA MICROELECTRON…     1631
#> 69 TOSHIBA      TOSHIBA MOBILE DISPLAY           TOSHIBA MOBILE DISPLA…     1631
#> 70 TOSHIBA      TOSHIBA SHIBAURA DENKI           TOSHIBA SHIBAURA DENKI     1631
#> 71 TOSHIBA      TOSHIBA SILICONE                 TOSHIBA SILICONE COMP…     1631
#> 72 TOSHIBA      TOSHIBA                          TOSHIBA SOLUTIONS COR…     1631
#> 73 TOSHIBA      TOSHIBA TEC                      TOSHIBA TEC CORPORATI…     1631
#> 74 TOSHIBA      TOSHIBA TECHNO                   TOSHIBA TECHNO CENTER      1631
#> 75 <NA>         FUJIMA                           FUJIMA                       NA
#> 76 <NA>         FUJINON                          FUJINON CORPORATION          NA

Created on 2022-07-11 by the reprex package (v2.0.1)

  • Related