Home > other >  issue merging dataframes by 'id' column
issue merging dataframes by 'id' column

Time:12-20

I found this very useful post, but I'm probably doing something wrong. I'm trying to merge myData1 and myData2. Both dataframes contain results from participants (ID) on different tests (which I've names 'variables'). Thing is: not all participants took myData2 set of tests. I'm trying to merge ONLY the data from participants who took both tests. Thing is: I'm always getting very weird dataframes with lots of nulls.

  • Note: I've tried all merge methods and the full/inner/left/right_join() family as well. I believe I'm mispecifying something... Example:

  • problem:

merge(myData1, myData2, by = "ID", all.x = T)
        ID   VAR1 VAR2 VAR3 VAR1_z VAR2_z VAR3_z newVar1 newvar2 newvar3 newvar4 newvar5
1    1-101 490.55   28   24   0.60   0.29   0.64    <NA>    <NA>    <NA>    <NA>    <NA>
2  107-202 447.89   29   28  -0.29   0.43   1.59    <NA>    <NA>    <NA>    <NA>    <NA>
3   11-101 427.93   23   24  -0.70  -0.42   0.64    <NA>    <NA>    <NA>    <NA>    <NA>
4   12-101 483.07   34   19   0.44   1.13  -0.56    <NA>    <NA>    <NA>    <NA>    <NA>
5   16-101 531.65   29   24   1.45   0.43   0.64    <NA>    <NA>    <NA>    <NA>    <NA>
6   18-101 426.33   34   20  -0.74   1.13  -0.32    <NA>    <NA>    <NA>    <NA>    <NA>
  • desired output:

a merged dataframe containing only the information concerning the participants (ID) from myData1. I need their info from all variables from data1 and data2 (it's ok if they don't have one or another var in data2)

Any ideas? Thanks in adv.

  • data:
> dput(myData1)
structure(list(ID = c("1-101", "107-202", "11-101", "12-101", 
"16-101", "18-101", "19-101", "22-101", "3-101", "31-102", "32-102", 
"33-102", "36-102", "38-102", "40-102", "46-102", "52-102", "58-201", 
"62-201", "63-201", "64-201", "65-201", "66-201", "7-101", "72-201", 
"74-201", "75-201", "77-201", "80-201", "9-101"), VAR1 = c(490.55, 
447.89, 427.93, 483.07, 531.65, 426.33, 461.05, 415.66, 522.57, 
414.31, 474.58, 522.91, 430.2, 474.25, 380.24, 536.81, 454.52, 
458.95, 474.02, 451.87, 466.27, 419.87, 529.83, 464.63, 583.49, 
445.63, 389.2, 403.18, 413.75, 456.42), VAR2 = c(28, 29, 23, 
34, 29, 34, 21, 24, 33, 22, 27, 25, 23, 33, 11, 24, 11, 23, 23, 
25, 23, 29, 25, 20, 25, 36, 47, 23, 30, 19), VAR3 = c(24, 28, 
24, 19, 24, 20, 18, 24, 17, 23, 24, 23, 18, 19, 18, 18, 17, 22, 
14, 21, 29, 25, 18, 16, 27, 18, 24, 31, 19, 18), VAR1_z = c(0.6, 
-0.29, -0.7, 0.44, 1.45, -0.74, -0.01, -0.96, 1.27, -0.99, 0.27, 
1.27, -0.66, 0.26, -1.7, 1.56, -0.15, -0.06, 0.26, -0.2, 0.09, 
-0.87, 1.42, 0.06, 2.53, -0.33, -1.51, -1.22, -1, -0.11), VAR2_z = c(0.29, 
0.43, -0.42, 1.13, 0.43, 1.13, -0.7, -0.28, 0.99, -0.56, 0.15, 
-0.14, -0.42, 0.99, -2.11, -0.28, -2.11, -0.42, -0.42, -0.14, 
-0.42, 0.43, -0.14, -0.84, -0.14, 1.42, 2.97, -0.42, 0.57, -0.98
), VAR3_z = c(0.64, 1.59, 0.64, -0.56, 0.64, -0.32, -0.8, 0.64, 
-1.04, 0.4, 0.64, 0.4, -0.8, -0.56, -0.8, -0.8, -1.04, 0.16, 
-1.75, -0.08, 1.83, 0.88, -0.8, -1.28, 1.36, -0.8, 0.64, 2.31, 
-0.56, -0.8)), row.names = c(NA, -30L), class = c("tbl_df", "tbl", 
"data.frame"))
> dput(myData2)
structure(list(ID = c("1_101", "10_101", "100_202", "101_202", 
"102_202", "105_202", "106_202", "107_202", "11_101", "12_101", 
"13_101", "14_101", "16_101", "18_101", "19_101", "20_101", "22_101", 
"26_101", "27_101", "28_101", "3_101", "30_102", "31_102", "32_102", 
"33_102", "34_102", "36_102", "37_102", "38_102", "39_102", "4_101", 
"40_102", "42_102", "44_102", "46_102", "47_102", "48_102", "5_101", 
"51_102", "52_102", "53_102", "56_102", "57_201", "58_201", "60_201", 
"61_201", "62_201", "63_201", "64_201", "65_201", "66_201", "67_201", 
"68_201", "69_201", "7_101", "72_201", "73_201", "74_201", "75_201", 
"77_201", "78_201", "80_201", "83_202", "84_202", "85_202", "87_202", 
"88_202", "89_202", "9_101", "94_202", "98_202"), newVar1 = c("70,18", 
"67,65", "71,89", "70,42", "", "72,38", "69,67", "75,63", "76,7", 
"76,21", "66,5", "70,57", "76,72", "66,4", "74,75", "79,17", 
"70,84", "", "67,82", "70", "71,88", "74,55", "69,33", "69,5", 
"65,25", "75,05", "75,44", "64,56", "74,88", "74,29", "72,4", 
"71,93", "", "69,12", "71,43", "77,53", "", "71,93", "70,4", 
"60,25", "", "", "64,8", "69", "", "71,19", "71,12", "75,04", 
"68,89", "68,26", "75,81", "", "", "", "75,89", "68,82", "77,35", 
"68,38", "76,71", "79,12", "78,89", "73,5", "", "69,7", "69,82", 
"70,91", "", "72", "71,17", "71,85", "69,7"), newvar2 = c("65,7", 
"65,8", "74,45", "68", "", "", "53,75", "73,94", "67,24", "58,22", 
"", "", "71,07", "68,07", "", "69,88", "71,32", "62,18", "58,65", 
"76,45", "71,13", "67,25", "", "51,76", "69,33", "68,17", "58", 
"54,27", "68,05", "", "", "61", "61,67", "", "67,79", "65,93", 
"", "", "59,27", "69,67", "71,38", "70", "", "64,88", "68,19", 
"62,06", "61", "55,48", "65,67", "67,72", "68,47", "64", "65,11", 
"66", "67,5", "66,33", "", "69,61", "69,33", "75,67", "68,17", 
"63", "", "58,81", "", "", "", "66,5", "62,33", "65", ""), newvar3 = c("68,53", 
"65,9", "69,78", "68,29", "", "69,5", "67,05", "73,74", "73,59", 
"72,57", "64,33", "67,79", "72,94", "63,75", "71,56", "75,5", 
"68,16", "", "65,64", "68,36", "69,75", "72,73", "67,67", "66,19", 
"62,94", "72,48", "72,19", "62,44", "72,5", "71,06", "70,4", 
"69,14", "", "67,59", "69,1", "74,05", "", "68,6", "68,27", "59,12", 
"", "", "63,7", "67,18", "", "68,38", "63,44", "72,56", "66,06", 
"66,53", "73,19", "", "", "", "73,44", "67,45", "72,91", "65,81", 
"73,96", "75", "75,89", "72", "", "68,2", "67,29", "69,91", "", 
"69,67", "68,39", "69,2", "67,55"), newvar4 = c("64,9", "63,8", 
"71,73", "67,67", "", "", "52,5", "72,35", "65,28", "57,22", 
"", "", "69", "66,67", "", "66,58", "69", "60,55", "56,29", "67,45", 
"68,4", "64,25", "", "50,86", "67,83", "65,96", "57", "53,07", 
"66,89", "", "", "59", "61,5", "", "65,9", "64,07", "", "", "57,91", 
"67,89", "68,75", "68,5", "", "63,24", "66,19", "60,59", "59,24", 
"54,33", "64,39", "65,83", "65,71", "63", "63,78", "63,62", "64", 
"65,08", "", "67,61", "67,57", "72,71", "65,46", "61,71", "", 
"57,62", "", "", "", "64", "61,33", "62,64", ""), newvar5 = c("4,94", 
"6,55", "2,44", "3", "", "3,25", "4,71", "2,84", "1,07", "2", 
"5,33", "5,43", "1,72", "10,55", "3", "1,17", "5,8", "", "10,55", 
"4,21", "2,94", "3,55", "6,33", "8,25", "5,88", "2", "3,44", 
"9,22", "1,69", "4,18", "2,5", "4,71", "", "4,41", "5,9", "2,21", 
"", "6,67", "3,33", "7", "", "", "8", "4,76", "", "4,44", "2,68", 
"3,16", "4,94", "5,42", "2,81", "", "", "", "1,78", "6,09", "2,52", 
"6,56", "1,96", "1,12", "0,67", "3,78", "", "3,5", "3,65", "5,27", 
"", "4,33", "6,78", "3,6", "4,35")), class = "data.frame", row.names = c(NA, 
-71L))
> 

CodePudding user response:

The problem is that some ID's are using "-" and others "_", they must be equal so you can apply some function to change them, here an example:

library(dplyr)
library(stringr)
    
myData1 %>% 
  mutate(ID = str_replace(ID,"_","-")) %>% 
  left_join(
    myData2 %>% 
      mutate(ID = str_replace(ID,"_","-"))
    )
  • Related