I have these two datasets (df1
and df2
):
df1<-structure(list(nclusters = c("2", "3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
"31", "32", "33", "34", "35"), score = c(0.977098310323762, 0.353912285832437,
0.099991714420885, 0.0376332612272506, 0.0376332612272506, 0.0991405887047536,
0.0767699856835315, 0.11960646869728, 0.143669700374312, 0.164847860595995,
0.161874525649473, 0.161874525649473, 0.161874525649473, 0.161874525649473,
0.161874525649473, 0.161874525649473, 0.186199471714141, 0.190526715556196,
0.179434847554162, 0.179434847554162, 0.179434847554162, 0.180225650241049,
0.186290449043437, 0.186290449043437, 0.197864663984131, 0.197864663984131,
0.197864663984131, 0.197864663984131, 0.197864663984131, 0.197864663984131,
0.197864663984131, 0.199807305611238, 0.211462882119658, 0.211462882119658
), rank = c(1L, 2L, 30L, 33L, 34L, 31L, 32L, 29L, 28L, 21L, 22L,
23L, 24L, 25L, 26L, 27L, 16L, 13L, 18L, 19L, 20L, 17L, 14L, 15L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 5L, 3L, 4L)), row.names = c(NA,
-34L), class = "data.frame")
> df1
nclusters score rank
1 2 0.97709831 1
2 3 0.35391229 2
3 4 0.09999171 30
4 5 0.03763326 33
5 6 0.03763326 34
6 7 0.09914059 31
7 8 0.07676999 32
8 9 0.11960647 29
9 10 0.14366970 28
10 11 0.16484786 21
11 12 0.16187453 22
12 13 0.16187453 23
13 14 0.16187453 24
14 15 0.16187453 25
15 16 0.16187453 26
16 17 0.16187453 27
17 18 0.18619947 16
18 19 0.19052672 13
19 20 0.17943485 18
20 21 0.17943485 19
21 22 0.17943485 20
22 23 0.18022565 17
23 24 0.18629045 14
24 25 0.18629045 15
25 26 0.19786466 6
26 27 0.19786466 7
27 28 0.19786466 8
28 29 0.19786466 9
29 30 0.19786466 10
30 31 0.19786466 11
31 32 0.19786466 12
32 33 0.19980731 5
33 34 0.21146288 3
34 35 0.21146288 4
df2<-structure(list(nclusters = c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35), Coverage = c(0.792205228889284,
0.912851642353066, 0.255393354214851, 0.255393354214851, 0.255393354214851,
0.353597680650851, 0.333736589639135, 0.412223082792884, 0.480194093041655,
0.566807993260141, 0.552509003210828, 0.552509003210828, 0.552509003210828,
0.552509003210828, 0.552509003210828, 0.552509003210828, 0.701211420528388,
0.737881313741794, 0.651352623359039, 0.651352623359039, 0.651352623359039,
0.656768957603489, 0.701940649590619, 0.701940649590619, 0.81095422062782,
0.81095422062782, 0.81095422062782, 0.81095422062782, 0.81095422062782,
0.81095422062782, 0.81095422062782, 0.833079159600134, 1, 1),
Production = c(1, 0.290313357105995, 0.104438599897452, 0.0395731724045748,
0.0395731724045748, 0.0395731724045748, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715
), Performance = c(0.896102614444642, 0.60158249972953, 0.179915977056152,
0.147483263309713, 0.147483263309713, 0.196585426527713,
0.167158703295883, 0.206401949872757, 0.240387454997143,
0.283694405106386, 0.276544910081729, 0.276544910081729,
0.276544910081729, 0.276544910081729, 0.276544910081729,
0.276544910081729, 0.350896118740509, 0.369231065347212,
0.325966720155835, 0.325966720155835, 0.325966720155835,
0.32867488727806, 0.351260733271625, 0.351260733271625, 0.405767518790225,
0.405767518790225, 0.405767518790225, 0.405767518790225,
0.405767518790225, 0.405767518790225, 0.405767518790225,
0.416829988276383, 0.500290408476315, 0.500290408476315),
Rank = c(1, 2, 31, 34, 33, 30, 32, 29, 28, 21, 27, 26, 25,
24, 23, 22, 16, 13, 20, 19, 18, 17, 15, 14, 12, 11, 10, 9,
8, 7, 6, 5, 4, 3)), class = "data.frame", row.names = c(NA,
-34L))
> df2
nclusters Coverage Production Performance Rank
1 2 0.7922052 1.000000000 0.8961026 1
2 3 0.9128516 0.290313357 0.6015825 2
3 4 0.2553934 0.104438600 0.1799160 31
4 5 0.2553934 0.039573172 0.1474833 34
5 6 0.2553934 0.039573172 0.1474833 33
6 7 0.3535977 0.039573172 0.1965854 30
7 8 0.3337366 0.000580817 0.1671587 32
8 9 0.4122231 0.000580817 0.2064019 29
9 10 0.4801941 0.000580817 0.2403875 28
10 11 0.5668080 0.000580817 0.2836944 21
11 12 0.5525090 0.000580817 0.2765449 27
12 13 0.5525090 0.000580817 0.2765449 26
13 14 0.5525090 0.000580817 0.2765449 25
14 15 0.5525090 0.000580817 0.2765449 24
15 16 0.5525090 0.000580817 0.2765449 23
16 17 0.5525090 0.000580817 0.2765449 22
17 18 0.7012114 0.000580817 0.3508961 16
18 19 0.7378813 0.000580817 0.3692311 13
19 20 0.6513526 0.000580817 0.3259667 20
20 21 0.6513526 0.000580817 0.3259667 19
21 22 0.6513526 0.000580817 0.3259667 18
22 23 0.6567690 0.000580817 0.3286749 17
23 24 0.7019406 0.000580817 0.3512607 15
24 25 0.7019406 0.000580817 0.3512607 14
25 26 0.8109542 0.000580817 0.4057675 12
26 27 0.8109542 0.000580817 0.4057675 11
27 28 0.8109542 0.000580817 0.4057675 10
28 29 0.8109542 0.000580817 0.4057675 9
29 30 0.8109542 0.000580817 0.4057675 8
30 31 0.8109542 0.000580817 0.4057675 7
31 32 0.8109542 0.000580817 0.4057675 6
32 33 0.8330792 0.000580817 0.4168300 5
33 34 1.0000000 0.000580817 0.5002904 4
34 35 1.0000000 0.000580817 0.5002904 3
Now I would like to create a new dataset, which I can call df3
, which uses the rank values of both df1
and df2
. So the output table will look like this:
nclusters df1 df2
2 1 1
3 2 2
4 30 31
5 33 34
6 34 33
7 31 30
8 32 32
9 29 29
10 28 28
11 21 21
12 22 27
13 23 26
14 24 25
15 25 24
16 26 23
17 27 22
18 16 16
19 13 13
20 18 20
21 19 19
22 20 18
23 17 17
24 14 15
25 15 14
26 6 12
27 7 11
28 8 10
29 9 9
30 10 8
31 11 7
32 12 6
33 5 5
34 3 4
35 4 3
CodePudding user response:
Here is another approach:
df1$nclusters <- as.numeric(df1$nclusters) # Convert to numeric as in df2
df3 <- merge(df1[, c(1, 3)], df2[, c(1, 5)], by="nclusters", sort=TRUE)
head(df3)
# nclusters rank Rank
# 1 2 1 1
# 2 3 2 2
# 3 4 30 31
# 4 5 33 34
# 5 6 34 33
# 6 7 31 30
CodePudding user response:
Using merge
and type.convert
. Use by*
argument, if more than one column in both data frames have exactly the same name.
res <- merge(type.convert(df1[-2], as.is=TRUE), df2[c(1, 5)])
head(res)
# nclusters rank Rank
# 1 2 1 1
# 2 3 2 2
# 3 4 30 31
# 4 5 33 34
# 5 6 34 33
# 6 7 31 30
If after that you want to identify rows that differ in both ranks you could do:
## rows
which(with(res, rank - Rank) != 0)
# [1] 3 4 5 6 11 12 13 14 15 16 19 21 23 24 25 26 27 29 30 31 33 34
## clusters
res$nclusters[with(res, rank - Rank) != 0]
# [1] 4 5 6 7 12 13 14 15 16 17 20 22 24 25 26 27 28 30 31 32 34 35
CodePudding user response:
Perhaps this will suit:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df1<-structure(list(nclusters = c("2", "3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
"31", "32", "33", "34", "35"), score = c(0.977098310323762, 0.353912285832437,
0.099991714420885, 0.0376332612272506, 0.0376332612272506, 0.0991405887047536,
0.0767699856835315, 0.11960646869728, 0.143669700374312, 0.164847860595995,
0.161874525649473, 0.161874525649473, 0.161874525649473, 0.161874525649473,
0.161874525649473, 0.161874525649473, 0.186199471714141, 0.190526715556196,
0.179434847554162, 0.179434847554162, 0.179434847554162, 0.180225650241049,
0.186290449043437, 0.186290449043437, 0.197864663984131, 0.197864663984131,
0.197864663984131, 0.197864663984131, 0.197864663984131, 0.197864663984131,
0.197864663984131, 0.199807305611238, 0.211462882119658, 0.211462882119658
), rank = c(1L, 2L, 30L, 33L, 34L, 31L, 32L, 29L, 28L, 21L, 22L,
23L, 24L, 25L, 26L, 27L, 16L, 13L, 18L, 19L, 20L, 17L, 14L, 15L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 5L, 3L, 4L)), row.names = c(NA,
-34L), class = "data.frame")
df2<-structure(list(nclusters = c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35), Coverage = c(0.792205228889284,
0.912851642353066, 0.255393354214851, 0.255393354214851, 0.255393354214851,
0.353597680650851, 0.333736589639135, 0.412223082792884, 0.480194093041655,
0.566807993260141, 0.552509003210828, 0.552509003210828, 0.552509003210828,
0.552509003210828, 0.552509003210828, 0.552509003210828, 0.701211420528388,
0.737881313741794, 0.651352623359039, 0.651352623359039, 0.651352623359039,
0.656768957603489, 0.701940649590619, 0.701940649590619, 0.81095422062782,
0.81095422062782, 0.81095422062782, 0.81095422062782, 0.81095422062782,
0.81095422062782, 0.81095422062782, 0.833079159600134, 1, 1),
Production = c(1, 0.290313357105995, 0.104438599897452, 0.0395731724045748,
0.0395731724045748, 0.0395731724045748, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715,
0.000580816952630715, 0.000580816952630715, 0.000580816952630715
), Performance = c(0.896102614444642, 0.60158249972953, 0.179915977056152,
0.147483263309713, 0.147483263309713, 0.196585426527713,
0.167158703295883, 0.206401949872757, 0.240387454997143,
0.283694405106386, 0.276544910081729, 0.276544910081729,
0.276544910081729, 0.276544910081729, 0.276544910081729,
0.276544910081729, 0.350896118740509, 0.369231065347212,
0.325966720155835, 0.325966720155835, 0.325966720155835,
0.32867488727806, 0.351260733271625, 0.351260733271625, 0.405767518790225,
0.405767518790225, 0.405767518790225, 0.405767518790225,
0.405767518790225, 0.405767518790225, 0.405767518790225,
0.416829988276383, 0.500290408476315, 0.500290408476315),
Rank = c(1, 2, 31, 34, 33, 30, 32, 29, 28, 21, 27, 26, 25,
24, 23, 22, 16, 13, 20, 19, 18, 17, 15, 14, 12, 11, 10, 9,
8, 7, 6, 5, 4, 3)), class = "data.frame", row.names = c(NA,
-34L))
df1$nclusters <- as.numeric(df1$nclusters)
df3 <- dplyr::left_join(df1, df2, by = "nclusters")
df3 %>%
rename("df1" = rank, "df2" = Rank) %>%
select(nclusters, df1, df2)
#> nclusters df1 df2
#> 1 2 1 1
#> 2 3 2 2
#> 3 4 30 31
#> 4 5 33 34
#> 5 6 34 33
#> 6 7 31 30
#> 7 8 32 32
#> 8 9 29 29
#> 9 10 28 28
#> 10 11 21 21
#> 11 12 22 27
#> 12 13 23 26
#> 13 14 24 25
#> 14 15 25 24
#> 15 16 26 23
#> 16 17 27 22
#> 17 18 16 16
#> 18 19 13 13
#> 19 20 18 20
#> 20 21 19 19
#> 21 22 20 18
#> 22 23 17 17
#> 23 24 14 15
#> 24 25 15 14
#> 25 26 6 12
#> 26 27 7 11
#> 27 28 8 10
#> 28 29 9 9
#> 29 30 10 8
#> 30 31 11 7
#> 31 32 12 6
#> 32 33 5 5
#> 33 34 3 4
#> 34 35 4 3
Created on 2022-02-16 by the reprex package (v2.0.1)