Merging/Scanning in R using like operator-CodePudding

I have two data frames in these formats.

df1 <- data.frame (Year  = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
                   Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA", 
                              "FAMILY BROS", "family, apple, ibm","family co.")
)

df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))

I need to create a data frame that shows each firm and its corresponding year of being a winner as illustrated in Data3 in the attached figure. I checked few links like this one Merge tables in R using like where they use a like operator but am unable to create the desired data as there can be multiple winners in a year. Please suggest what functions should I try to create Data3. Thanks!

Figure - Desired Data Format

CodePudding user response：

Using adist basically.

sp <- strsplit(df1$Winner, ',|;') |> lapply(trimws)
sp <- t(sapply(sp, `length<-`, max(lengths(sp)))) |> as.data.frame() |> cbind(Year=df1$Year)
sp <- reshape(sp, 1:3, idvar=4, direction='l', sep='') |> na.omit()
sp$Firm <- cutree(hclust(as.dist(adist(gsub('inc|co', '', tolower(sp$V))))), 4) |>
  factor(labels=c('Apple', 'Sonata Inc.', 'Family Bros. Co.', 'IBM'))
subset(sp[order(sp$Firm), ], select=c(Firm, Year))
#                 Firm Year
# 1.1            Apple 1991
# 2.1            Apple 1992
# 3.1            Apple 1993
# 6.2            Apple 1996
# 4.1      Sonata Inc. 1994
# 5.1 Family Bros. Co. 1995
# 6.1 Family Bros. Co. 1996
# 7.1 Family Bros. Co. 1997
# 3.2              IBM 1993
# 6.3              IBM 1996

CodePudding user response：

Try this

df <- sapply(gsub("\\s[a-zA-Z] \\W" , "" ,trimws(df2$Firm)), 
       function(x) grepl(tolower(x) ,
                         tolower(df1$Winner)))

l <- lapply(data.frame(df), function(x) df1$Year[x])

l

If you want the answer in data.frame use

ans <- data.frame(Firm = gsub("[0-9] ","",names(unlist(l))) ,
 year = unlist(l))
row.names(ans) <- NULL
ans

CodePudding user response：

Using fuzzyjoin.

(Use the second example only if the precise ordering matters.)

library(tidyverse)
library(fuzzyjoin)

# Data
df1 <- data.frame (Year  = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
                   Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA", 
                              "FAMILY BROS", "family, apple, ibm","family co.")
)

df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))

# If the order is unimportant
df1_sep <- df1 |> 
  separate_rows(Winner) |> 
  filter(!Winner %in% c("", "CO.", "inc.", "co.", "INC.", "BROS"))

df2 |> 
  mutate(Firm = str_squish(Firm)) |> 
  regex_right_join(df1_sep, by = c("Firm" = "Winner"), ignore_case = TRUE) |> 
  arrange(Firm, Year) |> 
  select(-Winner)

#>                Firm Year
#> 1             APPLE 1991
#> 2             APPLE 1992
#> 3             APPLE 1993
#> 4             APPLE 1996
#> 5  Family Bros. Co. 1995
#> 6  Family Bros. Co. 1996
#> 7  Family Bros. Co. 1997
#> 8               IBM 1993
#> 9               IBM 1996
#> 10      Sonata Inc. 1994



# If desired output order matters
df1_sep <- df1 |> 
  separate_rows(Winner) |> 
  filter(!Winner %in% c("", "CO.", "inc.", "co.", "INC.", "BROS"))

df2 |> 
  mutate(Firm = str_squish(Firm)) |> 
  regex_right_join(df1_sep, by = c("Firm" = "Winner"), ignore_case = TRUE) |> 
  group_by(Firm) |>
  mutate(sort = min(Year)) |>
  ungroup() |>
  arrange(sort, Year) |> 
  select(-Winner, -sort)

#> # A tibble: 10 × 2
#>    Firm              Year
#>    <chr>            <dbl>
#>  1 APPLE             1991
#>  2 APPLE             1992
#>  3 APPLE             1993
#>  4 APPLE             1996
#>  5 IBM               1993
#>  6 IBM               1996
#>  7 Sonata Inc.       1994
#>  8 Family Bros. Co.  1995
#>  9 Family Bros. Co.  1996
#> 10 Family Bros. Co.  1997

^{Created on 2022-06-18 by the reprex package (v2.0.1)}

CodePudding user response：

Base R, sure a simpler solution exists:

# Split each winning company up into separate elements in a list 
# of character vectors: winning_companies => list of character vectors
winning_companies <- strsplit(
  df1$Winner, 
  "\\;|\\,"
)

# Unroll the data.frame: df1_unrolled => data.frame
df1_unrolled <- data.frame(
  do.call(
    rbind,
    lapply(
      seq_len(nrow((df1))), 
      function(i){
        transform(
          df1[rep(i, length(winning_companies[[i]])),],
          Winner = trimws(unlist(winning_companies[[i]]), "both")
        )
      }
    )
  ),
  stringsAsFactors = FALSE,
  row.names = NULL
)

# Clean up the search terms: firm_names_std => character vector
df2$firm_names_std <- trimws(
  gsub(
    "\\w \\.", 
    "", 
    tolower(
      df2$Firm
    )
  ),
  "both"
)

# Resolve a dictionary to be used to lookup items:
# firm_dictionary => character vector
firm_dictionary <- names(
  sort(
    table(
      df2$firm_names_std
    ),
    decreasing = TRUE
  )
)


# Function to correct the spelling: correct_spelling => function
correct_spelling <- function(firm_name_vec, firm_dictionary, similarity_threshold = NULL) {
  # Derive the similarity threshold: st => integer scalar
  st <- similarity_threshold
  # Clean the words: firm_name => string scalar
  clean_firm_names <- trimws(
    gsub(
      "\\w \\.", 
      "", 
      tolower(
        firm_name_vec
      )
    ),
    "both"
  )
  # Function to correct the spelling at a scalar level: 
  # .correct_spelling_scalar => function
  .correct_spelling_scalar <- function(firm_name, firm_dictionary, similarity_threshold = st){
    # Calculate the levenshtein distance between the cleaned word
    # and each element in the dictionary: distance_from_dict => double vector
    distance_from_dict <- adist(firm_name, firm_dictionary, partial = TRUE)
    # If we are not using a similarity threhold: 
    if(is.null(similarity_threshold)){
      # Resolve the intermediate result: ir => character scalar
      ir <- firm_dictionary[which.min(distance_from_dict)]
    # Otherwise: 
    }else{
      # Count the number of characters of each element in the dictionary 
      # vector: n => integer vector
      n <- nchar(firm_dictionary)
      # Calculate the ratio between the number of characters differing between
      # each term in the dictionary and the total of number characters 
      # for a given dictionary element: dist_ratio => double vector
      dist_ratio <- distance_from_dict / n
      # Check if distance in ratio form is within the threshold:
      # selection_idx => logical vector
      selection_idx <- dist_ratio <= similarity_threshold
      # Resolve the intermediate result: ir => character scalar
      ir <- firm_dictionary[selection_idx]
    }
    # Resolve company name: res => string scalar
    res <- head(
      c(
        ir, 
        NA_character_
      ),
      1
    )
    # Explicitly define the returned object: character scalar => env
    return(res)
  }
  
  # Apply function to a vector: res => character vector
  res <- vapply(
    clean_firm_names, 
    function(x){
      .correct_spelling_scalar(x, firm_dictionary)
    },
    character(1),
    USE.NAMES = FALSE
  )
  
  # Explicitly define the returned object: character vector => env
  return(res)
}

# Derive the correct spelling of the firms: 
# cleaned_firm_names => character vector
cleaned_firm_names <- correct_spelling(
  df1_unrolled$Winner, 
  firm_dictionary
)

# Use the cleaned firm names to look up the formatted names in df2: 
# df3 => data.frame
df3 <- transform(
  df1_unrolled, 
  Winner = trimws(
    df2$Firm[match(cleaned_firm_names, df2$firm_names_std)],
    "both"
  )
)

# Output result to console: data.frame => stdout(console)
df3

Data:

df1 <- data.frame (Year  = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
                   Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA", 
                              "FAMILY BROS", "family, apple, ibm","family co.")
)

df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))