Home > database >  error when attempting to combine two dataframes in dplyr
error when attempting to combine two dataframes in dplyr

Time:11-28

I have the following two dataframes. I'm trying to combine the two using a left_join by Key, however I'm getting an error

df <- structure(list(Key = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 
6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L, 
10L, 10L, 11L, 11L, 11L, 11L), levels = c("GC23", "GC24", "GC25", 
"GC26", "GC27", "GC28", "GC30", "GC35", "GC45", "GC48", "GC50"
), class = "factor"), Quartile = structure(c(1L, 2L, 3L, 4L, 
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), levels = c("1", "2", "3", "4"
), class = "factor"), min = c(0.800000000000001, 70.2, 102.9, 
124.4, -108.1, -0.200000000000067, 63.2, 124.4, -70.9999999999999, 
5.29999999999999, 67.0999999999999, 144.3, -52.1000000000001, 
16, 37.1, 51.6999999999999, -19.2, 75.0999999999999, 92.7999999999999, 
161.8, -45.5, -3.80000000000003, 12.7000000000001, 31.3000000000001, 
3.30000000000013, 107.9, 120.2, 143.4, 29.7000000000001, 102.1, 
138.3, 172.3, 83.9, 183.6, 216.6, 240.3, 202.1, 258.6, 290.9, 
321.9, 107.5, 201.1, 247.1, 290.1), max = c(70.1, 102.8, 124.3, 
342.6, -0.200000000000067, 63.1, 124.4, 190.2, 4.79999999999992, 
67.0999999999999, 144.2, 209.7, 16, 37.1, 51.3999999999999, 131.7, 
75, 92.7000000000001, 161.3, 250.3, -4.70000000000006, 12.5999999999999, 
30.1, 62.9, 107.8, 119.8, 143.2, 192.3, 102, 138.2, 172.3, 258, 
183.5, 216.6, 240.3, 349.4, 258.5, 290.9, 321.9, 374.5, 201.1, 
247, 289.6, 400.9)), row.names = c(NA, -44L), class = c("tbl_df", 
"tbl", "data.frame"))

df2 <- structure(list(Key = structure(1:11, levels = c("GC23", "GC24", 
"GC25", "GC26", "GC27", "GC28", "GC30", "GC35", "GC45", "GC48", 
"GC50"), class = "factor"), Quartile = structure(c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = "Today", class = "factor"), 
    min = c(131.8, -47.2, -12.2, 36.1000000000001, 67.3, 27.4999999999999, 
    119, 133, 235.3, 287.6, 303.3), max = c(131.8, -47.2, -12.2, 
    36.1000000000001, 67.3, 27.4999999999999, 119, 133, 235.3, 
    287.6, 303.3)), row.names = c(NA, -11L), class = c("tbl_df", 
"tbl", "data.frame"))

left_join(df,df2, by = Key)
Error in standardise_join_by(by, x_names = x_names, y_names = y_names,  : 
  object 'Key' not found

Is there any way around this issue?

CodePudding user response:

Using base merge. Since df and df2 share Key there's no need to specify it. all=TRUE means, we want all rows of df and all of df2.

res <- merge(df, df2, all=TRUE)

head(res)
#    Key Quartile    min   max
# 1 GC23        1    0.8  70.1
# 2 GC23        2   70.2 102.8
# 3 GC23        3  102.9 124.3
# 4 GC23        4  124.4 342.6
# 5 GC23    Today  131.8 131.8
# 6 GC24        1 -108.1  -0.2

CodePudding user response:

It should be in quotes i.e. a character vector

library(dplyr)
left_join(df,df2, by = "Key")

-output

# A tibble: 44 × 7
   Key   Quartile.x    min.x   max.x Quartile.y min.y max.y
   <fct> <fct>         <dbl>   <dbl> <fct>      <dbl> <dbl>
 1 GC23  1             0.800  70.1   Today      132.  132. 
 2 GC23  2            70.2   103.    Today      132.  132. 
 3 GC23  3           103.    124.    Today      132.  132. 
 4 GC23  4           124.    343.    Today      132.  132. 
 5 GC24  1          -108.     -0.200 Today      -47.2 -47.2
 6 GC24  2            -0.200  63.1   Today      -47.2 -47.2
 7 GC24  3            63.2   124.    Today      -47.2 -47.2
 8 GC24  4           124.    190.    Today      -47.2 -47.2
 9 GC25  1           -71.0     4.80  Today      -12.2 -12.2
10 GC25  2             5.30   67.1   Today      -12.2 -12.2
# … with 34 more rows

Update

If we want to add new rows, use bind_rows

> bind_rows(df, df2)
# A tibble: 55 × 4
   Key   Quartile      min     max
   <fct> <fct>       <dbl>   <dbl>
 1 GC23  1           0.800  70.1  
 2 GC23  2          70.2   103.   
 3 GC23  3         103.    124.   
 4 GC23  4         124.    343.   
 5 GC24  1        -108.     -0.200
 6 GC24  2          -0.200  63.1  
 7 GC24  3          63.2   124.   
 8 GC24  4         124.    190.   
 9 GC25  1         -71.0     4.80 
10 GC25  2           5.30   67.1  
# … with 45 more rows

According to ?left_join

by - A character vector of variables to join by or a join specification created with


In case, if we need to update the min/max based on the new dataset

rows_update(df, df2[-2], by = "Key")
# A tibble: 44 × 4
   Key   Quartile   min   max
   <fct> <fct>    <dbl> <dbl>
 1 GC23  1        132.  132. 
 2 GC23  2        132.  132. 
 3 GC23  3        132.  132. 
 4 GC23  4        132.  132. 
 5 GC24  1        -47.2 -47.2
 6 GC24  2        -47.2 -47.2
 7 GC24  3        -47.2 -47.2
 8 GC24  4        -47.2 -47.2
 9 GC25  1        -12.2 -12.2
10 GC25  2        -12.2 -12.2
# … with 34 more rows
  • Related