Home > OS >  Merge two data frames by column name (merge() doesn't work)
Merge two data frames by column name (merge() doesn't work)

Time:12-15

I have the two data frames (this is the heads):

1:

# A tibble: 6 x 2
  twitterID           Username       
  <chr>               <chr>          
1 849567328899616768  AchimKessler   
2 1117749912          Achim_P        
3 186552155           NA             
4 172269309           agnieszka_mdb  
5 1127961248493129728 StegemannAlbert
6 1178640571725955073 BDobrindt 

2:

# A tibble: 6 x 3
  Username     TwitterID Name                
  <chr>            <dbl> <chr>               
1 achimkessler        NA Achim Kessler       
2 achim_p             NA Achim Post          
3 achim_p             NA Achim Post          
4 achim_p             NA Achim Post  (Minden)
5 NA                  NA Adis Ahmetovic      
6 NA                  NA Agnes Alpers  

I wish to join them by username to fill up the TwitterID column in dataframe 2 with the twitterID from dataframe 1.

df <- merge(x = 2, y = 1, by = "Username", all.x = TRUE)

... should do the job except it doesn't.

In the output df, there are many NA twitterID's for instances where the usernames do in fact match each other. If it is of any help, I can upload some larger data sets for you to test it. Any help is greatly appreciated.

CodePudding user response:

Just use tolower in merge if the capitalization in df1 is the problem.

merge(transform(df1, Username=tolower(Username)), df2, all=TRUE)
#          Username           twitterID TwitterID                 Name
# 1         achim_p          1117749912        NA           Achim Post
# 2         achim_p          1117749912        NA           Achim Post
# 3         achim_p          1117749912        NA Achim Post  (Minden)
# 4    achimkessler  849567328899616768        NA        Achim Kessler
# 5   agnieszka_mdb           172269309        NA                 <NA>
# 6       bdobrindt 1178640571725955073        NA                 <NA>
# 7 stegemannalbert 1127961248493129728        NA                 <NA>
# 8            <NA>           186552155        NA       Adis Ahmetovic
# 9            <NA>           186552155        NA         Agnes Alpers

Or:

merge(transform(df1, Username=tolower(Username)), df2[-2], all.y=TRUE) |>
  (\(x) {x[, 'Name'] <- gsub('\\s \\(.*', '', x[, 'Name']);x})() |>
  unique()
#       Username          twitterID           Name
# 1      achim_p         1117749912     Achim Post
# 4 achimkessler 849567328899616768  Achim Kessler
# 5         <NA>          186552155 Adis Ahmetovic
# 6         <NA>          186552155   Agnes Alpers

Otherwise, if I were you, I would take a look at ?agrep for approximate string matching.

> R.version.string
[1] "R version 4.1.2 (2021-11-01)"

Data:

df1 <- structure(list(twitterID = c("849567328899616768", "1117749912", 
"186552155", "172269309", "1127961248493129728", "1178640571725955073"
), Username = c("AchimKessler", "Achim_P", NA, "agnieszka_mdb", 
"StegemannAlbert", "BDobrindt")), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5", "6"))

df2 <- structure(list(Username = c("achimkessler", "achim_p", "achim_p", 
"achim_p", NA, NA), TwitterID = c(NA, NA, NA, NA, NA, NA), Name = c("Achim Kessler", 
"Achim Post", "Achim Post", "Achim Post  (Minden)", "Adis Ahmetovic", 
"Agnes Alpers")), class = "data.frame", row.names = c("1", "2", 
"3", "4", "5", "6"))

CodePudding user response:

You can use tidyverse. You can first make the usernames the same case, then join with the second dataframe, then use coalesce to replace NAs with the IDs (if available).

library(tidyverse)

df1 %>%
  dplyr::mutate(Username = tolower(Username)) %>%
  dplyr::rename(TwitterID = twitterID) %>% 
  dplyr::left_join(., df2, by = "Username") %>%
  dplyr::mutate(TwitterID = coalesce(TwitterID.x, TwitterID.y)) %>%
  dplyr::select(-TwitterID.x,-TwitterID.y) %>% 
  distinct()

Output

         Username                Name           TwitterID
1    achimkessler       Achim Kessler  849567328899616768
2         achim_p          Achim Post          1117749912
3         achim_p Achim Post (Minden)          1117749912
4            <NA>      Adis Ahmetovic           186552155
5            <NA>        Agnes Alpers           186552155
6   agnieszka_mdb                <NA>           172269309
7 stegemannalbert                <NA> 1127961248493129728
8       bdobrindt                <NA> 1178640571725955073

Or you could still use merge, but you would just need to make sure the dataframes are formatted the same (i.e., both username columns are in lower case).

df1 <- df1 %>% 
  dplyr::rename(TwitterID = twitterID) %>% 
  dplyr::mutate(Username = tolower(Username))

df <- merge(x = df2, y = df1, by = "Username", all.x = TRUE)

Output

      Username TwitterID.x                Name        TwitterID.y
1      achim_p          NA          Achim Post         1117749912
2      achim_p          NA          Achim Post         1117749912
3      achim_p          NA Achim Post (Minden)         1117749912
4 achimkessler          NA       Achim Kessler 849567328899616768
5         <NA>          NA      Adis Ahmetovic          186552155
6         <NA>          NA        Agnes Alpers          186552155

Data

df1 <-
  structure(list(
    twitterID = c(
      "849567328899616768",
      "1117749912",
      "186552155",
      "172269309",
      "1127961248493129728",
      "1178640571725955073"
    ),
    Username = c(
      "AchimKessler",
      "Achim_P",
      NA,
      "agnieszka_mdb",
      "StegemannAlbert",
      "BDobrindt"
    )
  ),
  class = "data.frame",
  row.names = c(NA,-6L))

df2 <-
  structure(
    list(
      Username = c("achimkessler", "achim_p", "achim_p",
                   "achim_p", NA, NA),
      TwitterID = c(NA, NA, NA, NA, NA, NA),
      Name = c(
        "Achim Kessler",
        "Achim Post",
        "Achim Post",
        "Achim Post (Minden)",
        "Adis Ahmetovic",
        "Agnes Alpers"
      )
    ),
    class = "data.frame",
    row.names = c(NA,-6L)
  )
  • Related