I found this very useful post, but I'm probably doing something wrong. I'm trying to merge myData1
and myData2
. Both dataframes contain results from participants (ID) on different tests (which I've names 'variables'). Thing is: not all participants took myData2
set of tests. I'm trying to merge ONLY the data from participants who took both tests. Thing is: I'm always getting very weird dataframes with lots of nulls.
Note: I've tried all
merge
methods and thefull/inner/left/right_join()
family as well. I believe I'm mispecifying something... Example:problem:
merge(myData1, myData2, by = "ID", all.x = T)
ID VAR1 VAR2 VAR3 VAR1_z VAR2_z VAR3_z newVar1 newvar2 newvar3 newvar4 newvar5
1 1-101 490.55 28 24 0.60 0.29 0.64 <NA> <NA> <NA> <NA> <NA>
2 107-202 447.89 29 28 -0.29 0.43 1.59 <NA> <NA> <NA> <NA> <NA>
3 11-101 427.93 23 24 -0.70 -0.42 0.64 <NA> <NA> <NA> <NA> <NA>
4 12-101 483.07 34 19 0.44 1.13 -0.56 <NA> <NA> <NA> <NA> <NA>
5 16-101 531.65 29 24 1.45 0.43 0.64 <NA> <NA> <NA> <NA> <NA>
6 18-101 426.33 34 20 -0.74 1.13 -0.32 <NA> <NA> <NA> <NA> <NA>
- desired output:
a merged dataframe containing only the information concerning the participants (ID) from myData1
. I need their info from all variables from data1
and data2
(it's ok if they don't have one or another var in data2)
Any ideas? Thanks in adv.
- data:
> dput(myData1)
structure(list(ID = c("1-101", "107-202", "11-101", "12-101",
"16-101", "18-101", "19-101", "22-101", "3-101", "31-102", "32-102",
"33-102", "36-102", "38-102", "40-102", "46-102", "52-102", "58-201",
"62-201", "63-201", "64-201", "65-201", "66-201", "7-101", "72-201",
"74-201", "75-201", "77-201", "80-201", "9-101"), VAR1 = c(490.55,
447.89, 427.93, 483.07, 531.65, 426.33, 461.05, 415.66, 522.57,
414.31, 474.58, 522.91, 430.2, 474.25, 380.24, 536.81, 454.52,
458.95, 474.02, 451.87, 466.27, 419.87, 529.83, 464.63, 583.49,
445.63, 389.2, 403.18, 413.75, 456.42), VAR2 = c(28, 29, 23,
34, 29, 34, 21, 24, 33, 22, 27, 25, 23, 33, 11, 24, 11, 23, 23,
25, 23, 29, 25, 20, 25, 36, 47, 23, 30, 19), VAR3 = c(24, 28,
24, 19, 24, 20, 18, 24, 17, 23, 24, 23, 18, 19, 18, 18, 17, 22,
14, 21, 29, 25, 18, 16, 27, 18, 24, 31, 19, 18), VAR1_z = c(0.6,
-0.29, -0.7, 0.44, 1.45, -0.74, -0.01, -0.96, 1.27, -0.99, 0.27,
1.27, -0.66, 0.26, -1.7, 1.56, -0.15, -0.06, 0.26, -0.2, 0.09,
-0.87, 1.42, 0.06, 2.53, -0.33, -1.51, -1.22, -1, -0.11), VAR2_z = c(0.29,
0.43, -0.42, 1.13, 0.43, 1.13, -0.7, -0.28, 0.99, -0.56, 0.15,
-0.14, -0.42, 0.99, -2.11, -0.28, -2.11, -0.42, -0.42, -0.14,
-0.42, 0.43, -0.14, -0.84, -0.14, 1.42, 2.97, -0.42, 0.57, -0.98
), VAR3_z = c(0.64, 1.59, 0.64, -0.56, 0.64, -0.32, -0.8, 0.64,
-1.04, 0.4, 0.64, 0.4, -0.8, -0.56, -0.8, -0.8, -1.04, 0.16,
-1.75, -0.08, 1.83, 0.88, -0.8, -1.28, 1.36, -0.8, 0.64, 2.31,
-0.56, -0.8)), row.names = c(NA, -30L), class = c("tbl_df", "tbl",
"data.frame"))
> dput(myData2)
structure(list(ID = c("1_101", "10_101", "100_202", "101_202",
"102_202", "105_202", "106_202", "107_202", "11_101", "12_101",
"13_101", "14_101", "16_101", "18_101", "19_101", "20_101", "22_101",
"26_101", "27_101", "28_101", "3_101", "30_102", "31_102", "32_102",
"33_102", "34_102", "36_102", "37_102", "38_102", "39_102", "4_101",
"40_102", "42_102", "44_102", "46_102", "47_102", "48_102", "5_101",
"51_102", "52_102", "53_102", "56_102", "57_201", "58_201", "60_201",
"61_201", "62_201", "63_201", "64_201", "65_201", "66_201", "67_201",
"68_201", "69_201", "7_101", "72_201", "73_201", "74_201", "75_201",
"77_201", "78_201", "80_201", "83_202", "84_202", "85_202", "87_202",
"88_202", "89_202", "9_101", "94_202", "98_202"), newVar1 = c("70,18",
"67,65", "71,89", "70,42", "", "72,38", "69,67", "75,63", "76,7",
"76,21", "66,5", "70,57", "76,72", "66,4", "74,75", "79,17",
"70,84", "", "67,82", "70", "71,88", "74,55", "69,33", "69,5",
"65,25", "75,05", "75,44", "64,56", "74,88", "74,29", "72,4",
"71,93", "", "69,12", "71,43", "77,53", "", "71,93", "70,4",
"60,25", "", "", "64,8", "69", "", "71,19", "71,12", "75,04",
"68,89", "68,26", "75,81", "", "", "", "75,89", "68,82", "77,35",
"68,38", "76,71", "79,12", "78,89", "73,5", "", "69,7", "69,82",
"70,91", "", "72", "71,17", "71,85", "69,7"), newvar2 = c("65,7",
"65,8", "74,45", "68", "", "", "53,75", "73,94", "67,24", "58,22",
"", "", "71,07", "68,07", "", "69,88", "71,32", "62,18", "58,65",
"76,45", "71,13", "67,25", "", "51,76", "69,33", "68,17", "58",
"54,27", "68,05", "", "", "61", "61,67", "", "67,79", "65,93",
"", "", "59,27", "69,67", "71,38", "70", "", "64,88", "68,19",
"62,06", "61", "55,48", "65,67", "67,72", "68,47", "64", "65,11",
"66", "67,5", "66,33", "", "69,61", "69,33", "75,67", "68,17",
"63", "", "58,81", "", "", "", "66,5", "62,33", "65", ""), newvar3 = c("68,53",
"65,9", "69,78", "68,29", "", "69,5", "67,05", "73,74", "73,59",
"72,57", "64,33", "67,79", "72,94", "63,75", "71,56", "75,5",
"68,16", "", "65,64", "68,36", "69,75", "72,73", "67,67", "66,19",
"62,94", "72,48", "72,19", "62,44", "72,5", "71,06", "70,4",
"69,14", "", "67,59", "69,1", "74,05", "", "68,6", "68,27", "59,12",
"", "", "63,7", "67,18", "", "68,38", "63,44", "72,56", "66,06",
"66,53", "73,19", "", "", "", "73,44", "67,45", "72,91", "65,81",
"73,96", "75", "75,89", "72", "", "68,2", "67,29", "69,91", "",
"69,67", "68,39", "69,2", "67,55"), newvar4 = c("64,9", "63,8",
"71,73", "67,67", "", "", "52,5", "72,35", "65,28", "57,22",
"", "", "69", "66,67", "", "66,58", "69", "60,55", "56,29", "67,45",
"68,4", "64,25", "", "50,86", "67,83", "65,96", "57", "53,07",
"66,89", "", "", "59", "61,5", "", "65,9", "64,07", "", "", "57,91",
"67,89", "68,75", "68,5", "", "63,24", "66,19", "60,59", "59,24",
"54,33", "64,39", "65,83", "65,71", "63", "63,78", "63,62", "64",
"65,08", "", "67,61", "67,57", "72,71", "65,46", "61,71", "",
"57,62", "", "", "", "64", "61,33", "62,64", ""), newvar5 = c("4,94",
"6,55", "2,44", "3", "", "3,25", "4,71", "2,84", "1,07", "2",
"5,33", "5,43", "1,72", "10,55", "3", "1,17", "5,8", "", "10,55",
"4,21", "2,94", "3,55", "6,33", "8,25", "5,88", "2", "3,44",
"9,22", "1,69", "4,18", "2,5", "4,71", "", "4,41", "5,9", "2,21",
"", "6,67", "3,33", "7", "", "", "8", "4,76", "", "4,44", "2,68",
"3,16", "4,94", "5,42", "2,81", "", "", "", "1,78", "6,09", "2,52",
"6,56", "1,96", "1,12", "0,67", "3,78", "", "3,5", "3,65", "5,27",
"", "4,33", "6,78", "3,6", "4,35")), class = "data.frame", row.names = c(NA,
-71L))
>
CodePudding user response:
The problem is that some ID's are using "-" and others "_", they must be equal so you can apply some function to change them, here an example:
library(dplyr)
library(stringr)
myData1 %>%
mutate(ID = str_replace(ID,"_","-")) %>%
left_join(
myData2 %>%
mutate(ID = str_replace(ID,"_","-"))
)