I have the two data frames (this is the heads):
1:
# A tibble: 6 x 2
twitterID Username
<chr> <chr>
1 849567328899616768 AchimKessler
2 1117749912 Achim_P
3 186552155 NA
4 172269309 agnieszka_mdb
5 1127961248493129728 StegemannAlbert
6 1178640571725955073 BDobrindt
2:
# A tibble: 6 x 3
Username TwitterID Name
<chr> <dbl> <chr>
1 achimkessler NA Achim Kessler
2 achim_p NA Achim Post
3 achim_p NA Achim Post
4 achim_p NA Achim Post (Minden)
5 NA NA Adis Ahmetovic
6 NA NA Agnes Alpers
I wish to join them by username to fill up the TwitterID column in dataframe 2 with the twitterID from dataframe 1.
df <- merge(x = 2, y = 1, by = "Username", all.x = TRUE)
... should do the job except it doesn't.
In the output df, there are many NA twitterID's for instances where the usernames do in fact match each other. If it is of any help, I can upload some larger data sets for you to test it. Any help is greatly appreciated.
CodePudding user response:
Just use tolower
in merge
if the capitalization in df1
is the problem.
merge(transform(df1, Username=tolower(Username)), df2, all=TRUE)
# Username twitterID TwitterID Name
# 1 achim_p 1117749912 NA Achim Post
# 2 achim_p 1117749912 NA Achim Post
# 3 achim_p 1117749912 NA Achim Post (Minden)
# 4 achimkessler 849567328899616768 NA Achim Kessler
# 5 agnieszka_mdb 172269309 NA <NA>
# 6 bdobrindt 1178640571725955073 NA <NA>
# 7 stegemannalbert 1127961248493129728 NA <NA>
# 8 <NA> 186552155 NA Adis Ahmetovic
# 9 <NA> 186552155 NA Agnes Alpers
Or:
merge(transform(df1, Username=tolower(Username)), df2[-2], all.y=TRUE) |>
(\(x) {x[, 'Name'] <- gsub('\\s \\(.*', '', x[, 'Name']);x})() |>
unique()
# Username twitterID Name
# 1 achim_p 1117749912 Achim Post
# 4 achimkessler 849567328899616768 Achim Kessler
# 5 <NA> 186552155 Adis Ahmetovic
# 6 <NA> 186552155 Agnes Alpers
Otherwise, if I were you, I would take a look at ?agrep
for approximate string matching.
> R.version.string
[1] "R version 4.1.2 (2021-11-01)"
Data:
df1 <- structure(list(twitterID = c("849567328899616768", "1117749912",
"186552155", "172269309", "1127961248493129728", "1178640571725955073"
), Username = c("AchimKessler", "Achim_P", NA, "agnieszka_mdb",
"StegemannAlbert", "BDobrindt")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))
df2 <- structure(list(Username = c("achimkessler", "achim_p", "achim_p",
"achim_p", NA, NA), TwitterID = c(NA, NA, NA, NA, NA, NA), Name = c("Achim Kessler",
"Achim Post", "Achim Post", "Achim Post (Minden)", "Adis Ahmetovic",
"Agnes Alpers")), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6"))
CodePudding user response:
You can use tidyverse
. You can first make the usernames the same case, then join with the second dataframe, then use coalesce
to replace NAs with the IDs (if available).
library(tidyverse)
df1 %>%
dplyr::mutate(Username = tolower(Username)) %>%
dplyr::rename(TwitterID = twitterID) %>%
dplyr::left_join(., df2, by = "Username") %>%
dplyr::mutate(TwitterID = coalesce(TwitterID.x, TwitterID.y)) %>%
dplyr::select(-TwitterID.x,-TwitterID.y) %>%
distinct()
Output
Username Name TwitterID
1 achimkessler Achim Kessler 849567328899616768
2 achim_p Achim Post 1117749912
3 achim_p Achim Post (Minden) 1117749912
4 <NA> Adis Ahmetovic 186552155
5 <NA> Agnes Alpers 186552155
6 agnieszka_mdb <NA> 172269309
7 stegemannalbert <NA> 1127961248493129728
8 bdobrindt <NA> 1178640571725955073
Or you could still use merge
, but you would just need to make sure the dataframes are formatted the same (i.e., both username columns are in lower case).
df1 <- df1 %>%
dplyr::rename(TwitterID = twitterID) %>%
dplyr::mutate(Username = tolower(Username))
df <- merge(x = df2, y = df1, by = "Username", all.x = TRUE)
Output
Username TwitterID.x Name TwitterID.y
1 achim_p NA Achim Post 1117749912
2 achim_p NA Achim Post 1117749912
3 achim_p NA Achim Post (Minden) 1117749912
4 achimkessler NA Achim Kessler 849567328899616768
5 <NA> NA Adis Ahmetovic 186552155
6 <NA> NA Agnes Alpers 186552155
Data
df1 <-
structure(list(
twitterID = c(
"849567328899616768",
"1117749912",
"186552155",
"172269309",
"1127961248493129728",
"1178640571725955073"
),
Username = c(
"AchimKessler",
"Achim_P",
NA,
"agnieszka_mdb",
"StegemannAlbert",
"BDobrindt"
)
),
class = "data.frame",
row.names = c(NA,-6L))
df2 <-
structure(
list(
Username = c("achimkessler", "achim_p", "achim_p",
"achim_p", NA, NA),
TwitterID = c(NA, NA, NA, NA, NA, NA),
Name = c(
"Achim Kessler",
"Achim Post",
"Achim Post",
"Achim Post (Minden)",
"Adis Ahmetovic",
"Agnes Alpers"
)
),
class = "data.frame",
row.names = c(NA,-6L)
)