Home > database >  Join specific columns from databases
Join specific columns from databases

Time:02-16

I have these two datasets (df1 and df2):

df1<-structure(list(nclusters = c("2", "3", "4", "5", "6", "7", "8", 
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", 
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", 
"31", "32", "33", "34", "35"), score = c(0.977098310323762, 0.353912285832437, 
0.099991714420885, 0.0376332612272506, 0.0376332612272506, 0.0991405887047536, 
0.0767699856835315, 0.11960646869728, 0.143669700374312, 0.164847860595995, 
0.161874525649473, 0.161874525649473, 0.161874525649473, 0.161874525649473, 
0.161874525649473, 0.161874525649473, 0.186199471714141, 0.190526715556196, 
0.179434847554162, 0.179434847554162, 0.179434847554162, 0.180225650241049, 
0.186290449043437, 0.186290449043437, 0.197864663984131, 0.197864663984131, 
0.197864663984131, 0.197864663984131, 0.197864663984131, 0.197864663984131, 
0.197864663984131, 0.199807305611238, 0.211462882119658, 0.211462882119658
), rank = c(1L, 2L, 30L, 33L, 34L, 31L, 32L, 29L, 28L, 21L, 22L, 
23L, 24L, 25L, 26L, 27L, 16L, 13L, 18L, 19L, 20L, 17L, 14L, 15L, 
6L, 7L, 8L, 9L, 10L, 11L, 12L, 5L, 3L, 4L)), row.names = c(NA, 
-34L), class = "data.frame")

> df1
   nclusters      score rank
1          2 0.97709831    1
2          3 0.35391229    2
3          4 0.09999171   30
4          5 0.03763326   33
5          6 0.03763326   34
6          7 0.09914059   31
7          8 0.07676999   32
8          9 0.11960647   29
9         10 0.14366970   28
10        11 0.16484786   21
11        12 0.16187453   22
12        13 0.16187453   23
13        14 0.16187453   24
14        15 0.16187453   25
15        16 0.16187453   26
16        17 0.16187453   27
17        18 0.18619947   16
18        19 0.19052672   13
19        20 0.17943485   18
20        21 0.17943485   19
21        22 0.17943485   20
22        23 0.18022565   17
23        24 0.18629045   14
24        25 0.18629045   15
25        26 0.19786466    6
26        27 0.19786466    7
27        28 0.19786466    8
28        29 0.19786466    9
29        30 0.19786466   10
30        31 0.19786466   11
31        32 0.19786466   12
32        33 0.19980731    5
33        34 0.21146288    3
34        35 0.21146288    4

df2<-structure(list(nclusters = c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 
28, 29, 30, 31, 32, 33, 34, 35), Coverage = c(0.792205228889284, 
0.912851642353066, 0.255393354214851, 0.255393354214851, 0.255393354214851, 
0.353597680650851, 0.333736589639135, 0.412223082792884, 0.480194093041655, 
0.566807993260141, 0.552509003210828, 0.552509003210828, 0.552509003210828, 
0.552509003210828, 0.552509003210828, 0.552509003210828, 0.701211420528388, 
0.737881313741794, 0.651352623359039, 0.651352623359039, 0.651352623359039, 
0.656768957603489, 0.701940649590619, 0.701940649590619, 0.81095422062782, 
0.81095422062782, 0.81095422062782, 0.81095422062782, 0.81095422062782, 
0.81095422062782, 0.81095422062782, 0.833079159600134, 1, 1), 
    Production = c(1, 0.290313357105995, 0.104438599897452, 0.0395731724045748, 
    0.0395731724045748, 0.0395731724045748, 0.000580816952630715, 
    0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
    0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
    0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
    0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
    0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
    0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
    0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
    0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
    0.000580816952630715, 0.000580816952630715, 0.000580816952630715
    ), Performance = c(0.896102614444642, 0.60158249972953, 0.179915977056152, 
    0.147483263309713, 0.147483263309713, 0.196585426527713, 
    0.167158703295883, 0.206401949872757, 0.240387454997143, 
    0.283694405106386, 0.276544910081729, 0.276544910081729, 
    0.276544910081729, 0.276544910081729, 0.276544910081729, 
    0.276544910081729, 0.350896118740509, 0.369231065347212, 
    0.325966720155835, 0.325966720155835, 0.325966720155835, 
    0.32867488727806, 0.351260733271625, 0.351260733271625, 0.405767518790225, 
    0.405767518790225, 0.405767518790225, 0.405767518790225, 
    0.405767518790225, 0.405767518790225, 0.405767518790225, 
    0.416829988276383, 0.500290408476315, 0.500290408476315), 
    Rank = c(1, 2, 31, 34, 33, 30, 32, 29, 28, 21, 27, 26, 25, 
    24, 23, 22, 16, 13, 20, 19, 18, 17, 15, 14, 12, 11, 10, 9, 
    8, 7, 6, 5, 4, 3)), class = "data.frame", row.names = c(NA, 
-34L))

> df2
   nclusters  Coverage  Production Performance Rank
1          2 0.7922052 1.000000000   0.8961026    1
2          3 0.9128516 0.290313357   0.6015825    2
3          4 0.2553934 0.104438600   0.1799160   31
4          5 0.2553934 0.039573172   0.1474833   34
5          6 0.2553934 0.039573172   0.1474833   33
6          7 0.3535977 0.039573172   0.1965854   30
7          8 0.3337366 0.000580817   0.1671587   32
8          9 0.4122231 0.000580817   0.2064019   29
9         10 0.4801941 0.000580817   0.2403875   28
10        11 0.5668080 0.000580817   0.2836944   21
11        12 0.5525090 0.000580817   0.2765449   27
12        13 0.5525090 0.000580817   0.2765449   26
13        14 0.5525090 0.000580817   0.2765449   25
14        15 0.5525090 0.000580817   0.2765449   24
15        16 0.5525090 0.000580817   0.2765449   23
16        17 0.5525090 0.000580817   0.2765449   22
17        18 0.7012114 0.000580817   0.3508961   16
18        19 0.7378813 0.000580817   0.3692311   13
19        20 0.6513526 0.000580817   0.3259667   20
20        21 0.6513526 0.000580817   0.3259667   19
21        22 0.6513526 0.000580817   0.3259667   18
22        23 0.6567690 0.000580817   0.3286749   17
23        24 0.7019406 0.000580817   0.3512607   15
24        25 0.7019406 0.000580817   0.3512607   14
25        26 0.8109542 0.000580817   0.4057675   12
26        27 0.8109542 0.000580817   0.4057675   11
27        28 0.8109542 0.000580817   0.4057675   10
28        29 0.8109542 0.000580817   0.4057675    9
29        30 0.8109542 0.000580817   0.4057675    8
30        31 0.8109542 0.000580817   0.4057675    7
31        32 0.8109542 0.000580817   0.4057675    6
32        33 0.8330792 0.000580817   0.4168300    5
33        34 1.0000000 0.000580817   0.5002904    4
34        35 1.0000000 0.000580817   0.5002904    3

Now I would like to create a new dataset, which I can call df3, which uses the rank values of both df1 and df2. So the output table will look like this:

 nclusters  df1  df2 
        2    1    1
        3    2    2
        4   30    31
        5   33    34
        6   34    33
        7   31    30
        8   32    32
        9   29    29
        10  28    28
        11  21    21
        12  22    27
        13  23    26
        14  24    25
        15  25    24
        16  26    23
        17  27    22
        18  16    16
        19  13    13
        20  18    20
        21  19    19
        22  20    18
        23  17    17
        24  14    15
        25  15    14
        26   6    12
        27   7    11
        28   8    10
        29   9    9
        30   10   8
        31   11   7
        32   12   6
        33    5   5
        34    3   4
        35    4   3

CodePudding user response:

Here is another approach:

df1$nclusters <- as.numeric(df1$nclusters)  # Convert to numeric as in df2
df3 <- merge(df1[, c(1, 3)], df2[, c(1, 5)], by="nclusters", sort=TRUE)
head(df3)
#   nclusters rank Rank
# 1         2    1    1
# 2         3    2    2
# 3         4   30   31
# 4         5   33   34
# 5         6   34   33
# 6         7   31   30

CodePudding user response:

Using merge and type.convert. Use by* argument, if more than one column in both data frames have exactly the same name.

res <- merge(type.convert(df1[-2], as.is=TRUE), df2[c(1, 5)])
head(res)
#   nclusters rank Rank
# 1         2    1    1
# 2         3    2    2
# 3         4   30   31
# 4         5   33   34
# 5         6   34   33
# 6         7   31   30

If after that you want to identify rows that differ in both ranks you could do:

## rows
which(with(res, rank - Rank) != 0)
# [1]  3  4  5  6 11 12 13 14 15 16 19 21 23 24 25 26 27 29 30 31 33 34

## clusters
res$nclusters[with(res, rank - Rank) != 0]
# [1]  4  5  6  7 12 13 14 15 16 17 20 22 24 25 26 27 28 30 31 32 34 35

CodePudding user response:

Perhaps this will suit:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

df1<-structure(list(nclusters = c("2", "3", "4", "5", "6", "7", "8", 
                                  "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", 
                                  "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", 
                                  "31", "32", "33", "34", "35"), score = c(0.977098310323762, 0.353912285832437, 
                                                                           0.099991714420885, 0.0376332612272506, 0.0376332612272506, 0.0991405887047536, 
                                                                           0.0767699856835315, 0.11960646869728, 0.143669700374312, 0.164847860595995, 
                                                                           0.161874525649473, 0.161874525649473, 0.161874525649473, 0.161874525649473, 
                                                                           0.161874525649473, 0.161874525649473, 0.186199471714141, 0.190526715556196, 
                                                                           0.179434847554162, 0.179434847554162, 0.179434847554162, 0.180225650241049, 
                                                                           0.186290449043437, 0.186290449043437, 0.197864663984131, 0.197864663984131, 
                                                                           0.197864663984131, 0.197864663984131, 0.197864663984131, 0.197864663984131, 
                                                                           0.197864663984131, 0.199807305611238, 0.211462882119658, 0.211462882119658
                                  ), rank = c(1L, 2L, 30L, 33L, 34L, 31L, 32L, 29L, 28L, 21L, 22L, 
                                              23L, 24L, 25L, 26L, 27L, 16L, 13L, 18L, 19L, 20L, 17L, 14L, 15L, 
                                              6L, 7L, 8L, 9L, 10L, 11L, 12L, 5L, 3L, 4L)), row.names = c(NA, 
                                                                                                         -34L), class = "data.frame")

df2<-structure(list(nclusters = c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
                                  12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 
                                  28, 29, 30, 31, 32, 33, 34, 35), Coverage = c(0.792205228889284, 
                                                                                0.912851642353066, 0.255393354214851, 0.255393354214851, 0.255393354214851, 
                                                                                0.353597680650851, 0.333736589639135, 0.412223082792884, 0.480194093041655, 
                                                                                0.566807993260141, 0.552509003210828, 0.552509003210828, 0.552509003210828, 
                                                                                0.552509003210828, 0.552509003210828, 0.552509003210828, 0.701211420528388, 
                                                                                0.737881313741794, 0.651352623359039, 0.651352623359039, 0.651352623359039, 
                                                                                0.656768957603489, 0.701940649590619, 0.701940649590619, 0.81095422062782, 
                                                                                0.81095422062782, 0.81095422062782, 0.81095422062782, 0.81095422062782, 
                                                                                0.81095422062782, 0.81095422062782, 0.833079159600134, 1, 1), 
                    Production = c(1, 0.290313357105995, 0.104438599897452, 0.0395731724045748, 
                                   0.0395731724045748, 0.0395731724045748, 0.000580816952630715, 
                                   0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
                                   0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
                                   0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
                                   0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
                                   0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
                                   0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
                                   0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
                                   0.000580816952630715, 0.000580816952630715, 0.000580816952630715, 
                                   0.000580816952630715, 0.000580816952630715, 0.000580816952630715
                    ), Performance = c(0.896102614444642, 0.60158249972953, 0.179915977056152, 
                                       0.147483263309713, 0.147483263309713, 0.196585426527713, 
                                       0.167158703295883, 0.206401949872757, 0.240387454997143, 
                                       0.283694405106386, 0.276544910081729, 0.276544910081729, 
                                       0.276544910081729, 0.276544910081729, 0.276544910081729, 
                                       0.276544910081729, 0.350896118740509, 0.369231065347212, 
                                       0.325966720155835, 0.325966720155835, 0.325966720155835, 
                                       0.32867488727806, 0.351260733271625, 0.351260733271625, 0.405767518790225, 
                                       0.405767518790225, 0.405767518790225, 0.405767518790225, 
                                       0.405767518790225, 0.405767518790225, 0.405767518790225, 
                                       0.416829988276383, 0.500290408476315, 0.500290408476315), 
                    Rank = c(1, 2, 31, 34, 33, 30, 32, 29, 28, 21, 27, 26, 25, 
                             24, 23, 22, 16, 13, 20, 19, 18, 17, 15, 14, 12, 11, 10, 9, 
                             8, 7, 6, 5, 4, 3)), class = "data.frame", row.names = c(NA, 
                                                                                     -34L))

df1$nclusters <- as.numeric(df1$nclusters)
df3 <- dplyr::left_join(df1, df2, by = "nclusters")
df3 %>%
  rename("df1" = rank, "df2" = Rank) %>%
  select(nclusters, df1, df2)
#>    nclusters df1 df2
#> 1          2   1   1
#> 2          3   2   2
#> 3          4  30  31
#> 4          5  33  34
#> 5          6  34  33
#> 6          7  31  30
#> 7          8  32  32
#> 8          9  29  29
#> 9         10  28  28
#> 10        11  21  21
#> 11        12  22  27
#> 12        13  23  26
#> 13        14  24  25
#> 14        15  25  24
#> 15        16  26  23
#> 16        17  27  22
#> 17        18  16  16
#> 18        19  13  13
#> 19        20  18  20
#> 20        21  19  19
#> 21        22  20  18
#> 22        23  17  17
#> 23        24  14  15
#> 24        25  15  14
#> 25        26   6  12
#> 26        27   7  11
#> 27        28   8  10
#> 28        29   9   9
#> 29        30  10   8
#> 30        31  11   7
#> 31        32  12   6
#> 32        33   5   5
#> 33        34   3   4
#> 34        35   4   3

Created on 2022-02-16 by the reprex package (v2.0.1)

  •  Tags:  
  • r
  • Related