I have two dataframes:
dists:
label1 label2 dist sameCol ID1 ID2
193 194 0.7219847 NA N53 <NA>
193 195 0.5996300 FALSE N53 N43
193 196 0.2038451 FALSE N5 N45
194 195 0.2190454 NA <NA> N43
194 196 0.8894645 NA <NA> N45
195 196 0.7910169 TRUE N38 N5
networkDistances:
ID1 ID2 colony value networkDist
N38 N5 10 0.05 1
N36 N5 10 0.03 1
N4 N3 12 10.00 1
N4 N5 12 10.00 1
N4 N15 12 5.00 1
N15 N14 12 5.00 1
I am trying to join them, IF dists$sameCol == TRUE
&& ID1 and ID2 match, then paste the columns from networkDistances (all other rows should be NA), to look like:
label1 label2 dist sameCol ID1 ID2 colony value networkDist
193 194 0.7219847 NA N53 <NA> NA NA NA
193 195 0.5996300 FALSE N53 N43 NA NA NA
193 196 0.2038451 FALSE N5 N45 NA NA NA
194 195 0.2190454 NA <NA> N43 NA NA NA
194 196 0.8894645 NA <NA> N45 NA NA NA
195 196 0.7910169 TRUE N38 N5 10 0.05 1
I have tried these and they are not working, they paste some info into rows where dists$sameCol == FALSE
r <- left_join(dists, networkDistances, by = c("ID1" = "ID1", "ID2" = "ID2"))
r <- left_join(dists, networkDistances, by = c("ID1" = "ID1", "ID2" = "ID2")) %>%
mutate(networkDist = case_when(sameCol %in% T ~ networkDist))
r <-dists %>%
left_join(networkDistances, by = c("ID1","ID2"))%>%
mutate(networkDist = case_when(sameCol== T ~ networkDist))
CodePudding user response:
Prior to merging, add a sameCol
column with all TRUE
values to networkDists
and use it as an additional key:
library(dplyr)
left_join(
dists,
mutate(networkDistances, sameCol = TRUE),
by = c("ID1", "ID2", "sameCol")
)
# A tibble: 6 × 9
label1 label2 dist sameCol ID1 ID2 colony value networkDist
<dbl> <dbl> <dbl> <lgl> <chr> <chr> <dbl> <dbl> <dbl>
1 193 194 0.722 NA N53 <NA> NA NA NA
2 193 195 0.600 FALSE N53 N43 NA NA NA
3 193 196 0.204 FALSE N5 N45 NA NA NA
4 194 195 0.219 NA <NA> N43 NA NA NA
5 194 196 0.889 NA <NA> N45 NA NA NA
6 195 196 0.791 TRUE N38 N5 10 0.05 1
CodePudding user response:
r <- left_join(dists, networkDistances, by = c("ID1", "ID2"))
r[r$sameCol != TRUE | is.na(r$sameCol), c("colony", "value", "networkDist")] <- NA
First line does the join (and for your example, achieves the required output). Second line amends those columns to NA for any non-TRUEsameCol
, including those with NA
.
CodePudding user response:
First check the condition, then do the merge. I added some more rows to the example to make it clear it omits the negative case.
dists
label1 label2 dist sameCol ID1 ID2
1 193 194 0.7219847 NA N53 <NA>
2 193 195 0.5996300 FALSE N53 N43
3 193 196 0.2038451 FALSE N5 N45
4 194 195 0.2190454 NA <NA> N43
5 194 196 0.8894645 NA <NA> N45
6 195 196 0.7910169 TRUE N38 N5
61 195 196 0.7910169 FALSE N38 N5
62 195 196 0.7910169 TRUE N36 N5
611 195 196 0.7910169 FALSE N38 N5
get the subset
dists_flt <- dists[c(with(dists, which(!(ID1 %in% netdist$ID1 & ID2 %in% netdist$ID2))),
with(dists, which((ID1 %in% netdist$ID1 & ID2 %in% netdist$ID2 & sameCol == T)))),]
with base R
merge(dists_flt, netdist, c("ID1", "ID2"), all.x = T)
ID1 ID2 label1 label2 dist sameCol colony value networkDist
1 <NA> N43 194 195 0.2190454 NA NA NA NA
2 <NA> N45 194 196 0.8894645 NA NA NA NA
3 N36 N5 195 196 0.7910169 TRUE 10 0.03 1
4 N38 N5 195 196 0.7910169 TRUE 10 0.05 1
5 N5 N45 193 196 0.2038451 FALSE NA NA NA
6 N53 <NA> 193 194 0.7219847 NA NA NA NA
7 N53 N43 193 195 0.5996300 FALSE NA NA NA
or with dplyr
library(dplyr)
left_join(dists_flt, netdist, c("ID1", "ID2"))
label1 label2 dist sameCol ID1 ID2 colony value networkDist
1 193 194 0.7219847 NA N53 <NA> NA NA NA
2 193 195 0.5996300 FALSE N53 N43 NA NA NA
3 193 196 0.2038451 FALSE N5 N45 NA NA NA
4 194 195 0.2190454 NA <NA> N43 NA NA NA
5 194 196 0.8894645 NA <NA> N45 NA NA NA
6 195 196 0.7910169 TRUE N38 N5 10 0.05 1
7 195 196 0.7910169 TRUE N36 N5 10 0.03 1
Extended data
dists <- structure(list(label1 = c(193L, 193L, 193L, 194L, 194L, 195L,
195L, 195L, 195L), label2 = c(194L, 195L, 196L, 195L, 196L, 196L,
196L, 196L, 196L), dist = c(0.7219847, 0.59963, 0.2038451, 0.2190454,
0.8894645, 0.7910169, 0.7910169, 0.7910169, 0.7910169), sameCol = c(NA,
FALSE, FALSE, NA, NA, TRUE, FALSE, TRUE, FALSE), ID1 = c("N53",
"N53", "N5", "<NA>", "<NA>", "N38", "N38", "N36", "N38"), ID2 = c("<NA>",
"N43", "N45", "N43", "N45", "N5", "N5", "N5", "N5")), row.names = c("1",
"2", "3", "4", "5", "6", "61", "62", "611"), class = "data.frame")
netdist <- structure(list(ID1 = c("N38", "N36", "N4", "N4", "N4", "N15"),
ID2 = c("N5", "N5", "N3", "N5", "N15", "N14"), colony = c(10L,
10L, 12L, 12L, 12L, 12L), value = c(0.05, 0.03, 10, 10, 5,
5), networkDist = c(1L, 1L, 1L, 1L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-6L))