I have two data frames in these formats.
df1 <- data.frame (Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
"FAMILY BROS", "family, apple, ibm","family co.")
)
df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))
I need to create a data frame that shows each firm and its corresponding year of being a winner as illustrated in Data3 in the attached figure. I checked few links like this one Merge tables in R using like where they use a like operator but am unable to create the desired data as there can be multiple winners in a year. Please suggest what functions should I try to create Data3. Thanks!
CodePudding user response:
Using adist
basically.
sp <- strsplit(df1$Winner, ',|;') |> lapply(trimws)
sp <- t(sapply(sp, `length<-`, max(lengths(sp)))) |> as.data.frame() |> cbind(Year=df1$Year)
sp <- reshape(sp, 1:3, idvar=4, direction='l', sep='') |> na.omit()
sp$Firm <- cutree(hclust(as.dist(adist(gsub('inc|co', '', tolower(sp$V))))), 4) |>
factor(labels=c('Apple', 'Sonata Inc.', 'Family Bros. Co.', 'IBM'))
subset(sp[order(sp$Firm), ], select=c(Firm, Year))
# Firm Year
# 1.1 Apple 1991
# 2.1 Apple 1992
# 3.1 Apple 1993
# 6.2 Apple 1996
# 4.1 Sonata Inc. 1994
# 5.1 Family Bros. Co. 1995
# 6.1 Family Bros. Co. 1996
# 7.1 Family Bros. Co. 1997
# 3.2 IBM 1993
# 6.3 IBM 1996
CodePudding user response:
Try this
df <- sapply(gsub("\\s[a-zA-Z] \\W" , "" ,trimws(df2$Firm)),
function(x) grepl(tolower(x) ,
tolower(df1$Winner)))
l <- lapply(data.frame(df), function(x) df1$Year[x])
l
If you want the answer in data.frame use
ans <- data.frame(Firm = gsub("[0-9] ","",names(unlist(l))) ,
year = unlist(l))
row.names(ans) <- NULL
ans
CodePudding user response:
Using fuzzyjoin.
(Use the second example only if the precise ordering matters.)
library(tidyverse)
library(fuzzyjoin)
# Data
df1 <- data.frame (Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
"FAMILY BROS", "family, apple, ibm","family co.")
)
df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))
# If the order is unimportant
df1_sep <- df1 |>
separate_rows(Winner) |>
filter(!Winner %in% c("", "CO.", "inc.", "co.", "INC.", "BROS"))
df2 |>
mutate(Firm = str_squish(Firm)) |>
regex_right_join(df1_sep, by = c("Firm" = "Winner"), ignore_case = TRUE) |>
arrange(Firm, Year) |>
select(-Winner)
#> Firm Year
#> 1 APPLE 1991
#> 2 APPLE 1992
#> 3 APPLE 1993
#> 4 APPLE 1996
#> 5 Family Bros. Co. 1995
#> 6 Family Bros. Co. 1996
#> 7 Family Bros. Co. 1997
#> 8 IBM 1993
#> 9 IBM 1996
#> 10 Sonata Inc. 1994
# If desired output order matters
df1_sep <- df1 |>
separate_rows(Winner) |>
filter(!Winner %in% c("", "CO.", "inc.", "co.", "INC.", "BROS"))
df2 |>
mutate(Firm = str_squish(Firm)) |>
regex_right_join(df1_sep, by = c("Firm" = "Winner"), ignore_case = TRUE) |>
group_by(Firm) |>
mutate(sort = min(Year)) |>
ungroup() |>
arrange(sort, Year) |>
select(-Winner, -sort)
#> # A tibble: 10 × 2
#> Firm Year
#> <chr> <dbl>
#> 1 APPLE 1991
#> 2 APPLE 1992
#> 3 APPLE 1993
#> 4 APPLE 1996
#> 5 IBM 1993
#> 6 IBM 1996
#> 7 Sonata Inc. 1994
#> 8 Family Bros. Co. 1995
#> 9 Family Bros. Co. 1996
#> 10 Family Bros. Co. 1997
Created on 2022-06-18 by the reprex package (v2.0.1)
CodePudding user response:
Base R, sure a simpler solution exists:
# Split each winning company up into separate elements in a list
# of character vectors: winning_companies => list of character vectors
winning_companies <- strsplit(
df1$Winner,
"\\;|\\,"
)
# Unroll the data.frame: df1_unrolled => data.frame
df1_unrolled <- data.frame(
do.call(
rbind,
lapply(
seq_len(nrow((df1))),
function(i){
transform(
df1[rep(i, length(winning_companies[[i]])),],
Winner = trimws(unlist(winning_companies[[i]]), "both")
)
}
)
),
stringsAsFactors = FALSE,
row.names = NULL
)
# Clean up the search terms: firm_names_std => character vector
df2$firm_names_std <- trimws(
gsub(
"\\w \\.",
"",
tolower(
df2$Firm
)
),
"both"
)
# Resolve a dictionary to be used to lookup items:
# firm_dictionary => character vector
firm_dictionary <- names(
sort(
table(
df2$firm_names_std
),
decreasing = TRUE
)
)
# Function to correct the spelling: correct_spelling => function
correct_spelling <- function(firm_name_vec, firm_dictionary, similarity_threshold = NULL) {
# Derive the similarity threshold: st => integer scalar
st <- similarity_threshold
# Clean the words: firm_name => string scalar
clean_firm_names <- trimws(
gsub(
"\\w \\.",
"",
tolower(
firm_name_vec
)
),
"both"
)
# Function to correct the spelling at a scalar level:
# .correct_spelling_scalar => function
.correct_spelling_scalar <- function(firm_name, firm_dictionary, similarity_threshold = st){
# Calculate the levenshtein distance between the cleaned word
# and each element in the dictionary: distance_from_dict => double vector
distance_from_dict <- adist(firm_name, firm_dictionary, partial = TRUE)
# If we are not using a similarity threhold:
if(is.null(similarity_threshold)){
# Resolve the intermediate result: ir => character scalar
ir <- firm_dictionary[which.min(distance_from_dict)]
# Otherwise:
}else{
# Count the number of characters of each element in the dictionary
# vector: n => integer vector
n <- nchar(firm_dictionary)
# Calculate the ratio between the number of characters differing between
# each term in the dictionary and the total of number characters
# for a given dictionary element: dist_ratio => double vector
dist_ratio <- distance_from_dict / n
# Check if distance in ratio form is within the threshold:
# selection_idx => logical vector
selection_idx <- dist_ratio <= similarity_threshold
# Resolve the intermediate result: ir => character scalar
ir <- firm_dictionary[selection_idx]
}
# Resolve company name: res => string scalar
res <- head(
c(
ir,
NA_character_
),
1
)
# Explicitly define the returned object: character scalar => env
return(res)
}
# Apply function to a vector: res => character vector
res <- vapply(
clean_firm_names,
function(x){
.correct_spelling_scalar(x, firm_dictionary)
},
character(1),
USE.NAMES = FALSE
)
# Explicitly define the returned object: character vector => env
return(res)
}
# Derive the correct spelling of the firms:
# cleaned_firm_names => character vector
cleaned_firm_names <- correct_spelling(
df1_unrolled$Winner,
firm_dictionary
)
# Use the cleaned firm names to look up the formatted names in df2:
# df3 => data.frame
df3 <- transform(
df1_unrolled,
Winner = trimws(
df2$Firm[match(cleaned_firm_names, df2$firm_names_std)],
"both"
)
)
# Output result to console: data.frame => stdout(console)
df3
Data:
df1 <- data.frame (Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
"FAMILY BROS", "family, apple, ibm","family co.")
)
df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))