Home > OS >  Remove already existing pairs from dataset
Remove already existing pairs from dataset

Time:09-07

I have a dataset containing two columns ("from" & "to") representing interactions between actors. My problem is that each interaction is represented from the perspective of each actor, meaning the pair from A to B is also present as from B to A. Usually, duplicates can be identified and removed via unique() or distinct(), but in this case I did not find a solution. Any suggestions? Here is some of my data:

structure(list(from = c("anti-government protesters", "anti-government protesters", 
"human rights organisations", "human rights organisations", "MDCT supporters", 
"Movement for Democratic Change", "Movement for Democratic Change", 
"Movement for Democratic Change", "Movement for Democratic Change", 
"nurses", "nurses", "opposition", "opposition", "opposition", 
"opposition supporters", "opposition supporters", "opposition supporters", 
"opposition supporters", "opposition supporters", "ordinary citizens", 
"ordinary citizens", "youth activists", "youth activists", "youth activists"
), to = c("Movement for Democratic Change", "opposition supporters", 
"nurses", "ordinary citizens", "opposition supporters", "anti-government protesters", 
"opposition", "opposition supporters", "youth activists", "human rights organisations", 
"ordinary citizens", "Movement for Democratic Change", "opposition supporters", 
"youth activists", "anti-government protesters", "MDCT supporters", 
"Movement for Democratic Change", "opposition", "youth activists", 
"human rights organisations", "nurses", "Movement for Democratic Change", 
"opposition", "opposition supporters"), n = c(1L, 1L, 1L, 1L, 
1L, 1L, 12L, 16L, 12L, 2L, 2L, 5L, 5L, 5L, 1L, 1L, 6L, 3L, 3L, 
1L, 1L, 1L, 1L, 1L)), class = c("grouped_df", "tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -24L), groups = structure(list(
    from = c("anti-government protesters", "anti-government protesters", 
    "human rights organisations", "human rights organisations", 
    "MDCT supporters", "Movement for Democratic Change", "Movement for Democratic Change", 
    "Movement for Democratic Change", "Movement for Democratic Change", 
    "nurses", "nurses", "opposition", "opposition", "opposition", 
    "opposition supporters", "opposition supporters", "opposition supporters", 
    "opposition supporters", "opposition supporters", "ordinary citizens", 
    "ordinary citizens", "youth activists", "youth activists", 
    "youth activists"), to = c("Movement for Democratic Change", 
    "opposition supporters", "nurses", "ordinary citizens", "opposition supporters", 
    "anti-government protesters", "opposition", "opposition supporters", 
    "youth activists", "human rights organisations", "ordinary citizens", 
    "Movement for Democratic Change", "opposition supporters", 
    "youth activists", "anti-government protesters", "MDCT supporters", 
    "Movement for Democratic Change", "opposition", "youth activists", 
    "human rights organisations", "nurses", "Movement for Democratic Change", 
    "opposition", "opposition supporters"), .rows = structure(list(
        1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 
        14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -24L), .drop = TRUE))

CodePudding user response:

You could sort the values of from and to row-wise and assign them back to df.

# Put the two values of each row in sorted order so that the pairs A-B and
# B-A end up identical. apply() over rows returns one sorted pair per column,
# so the result is transposed before being written back into columns 1 and 2.
df[1:2] <- t(apply(df[1:2], MARGIN = 1, FUN = sort))

Then unique() or distinct() will work to remove duplicated rows.

# Base R: drop repeated combinations of the first two columns.
unique(df[1:2])

# dplyr equivalent: keep one row per distinct (from, to) combination.
distinct(df, from, to)

                             from                             to
1      anti-government protesters Movement for Democratic Change
2      anti-government protesters          opposition supporters
3      human rights organisations                         nurses
4      human rights organisations              ordinary citizens
5                 MDCT supporters          opposition supporters
7  Movement for Democratic Change                     opposition
8  Movement for Democratic Change          opposition supporters
9  Movement for Democratic Change                youth activists
11                         nurses              ordinary citizens
13                     opposition          opposition supporters
14                     opposition                youth activists
19          opposition supporters                youth activists

You could also sum up the values of n by each pair of from and to.

# Total n for every (from, to) combination. The formula needs `+` between the
# grouping variables: `n ~ from + to` means "n grouped by from and to".
# NOTE: mirrored pairs (A-B / B-A) are only merged if from/to in df were
# sorted row-wise beforehand.
aggregate(n ~ from + to, data = df, FUN = sum)

# Or with dplyr: count() with wt = n sums n within each (from, to) group.
df %>% count(from, to, wt = n)

                             from                             to  n
1      anti-government protesters Movement for Democratic Change  2
2      human rights organisations                         nurses  3
3  Movement for Democratic Change                     opposition 17
4      anti-government protesters          opposition supporters  2
5                 MDCT supporters          opposition supporters  2
6  Movement for Democratic Change          opposition supporters 22
7                      opposition          opposition supporters  8
8      human rights organisations              ordinary citizens  2
9                          nurses              ordinary citizens  3
10 Movement for Democratic Change                youth activists 13
11                     opposition                youth activists  6
12          opposition supporters                youth activists  4
  • Related