How to preserve groups when resampling time series cross-sectional data?-CodePudding

I have data that lists the last names of each legislator who served in legislative sessions from six states during 2009-2018. Here's an abbreviated example:

print(df)

   names1  names2 state chamber year cluster y        dyadID
1   Jones   Smith    AK       L 2010   AKL10 1    JonesSmith
2   Smith   Jones    AK       L 2010   AKL10 0    SmithJones
3   Jones   Munoz    AK       L 2012   AKL12 0    JonesMunoz
4   Munoz  Taylor    AK       L 2012   AKL12 1   MunozTaylor
5 Stewart   Kelly    AK       U 2014   AKU14 1  StewartKelly
6  Parker   Jones    AK       U 2014   AKU14 0   ParkerJones
7  Murphy Wallace    AK       U 2016   AKU16 1 MurphyWallace
8 Wallace    Neal    AK       U 2016   AKU16 0   WallaceNeal

I'm using a simulation method that systematically re-shuffles the order of the names, but I don't want names from different clusters mixed together. For example, Jones and Smith form one potential dyad because they're in the same cluster (AKL10), but I wouldn't want a Jones/Smith dyad to be possible for AKL12 since Smith is not in that cluster.

I can get it easily with the observed dyads, but not all potential observed & unobserved combinations. So far, I've tried nesting the names in lists by cluster but I can't figure out how to maintain those groups in my resampling function. Here's what I've got so far:

cluster <- df$cluster
p1 <- df$names1
p2 <- df$names2
y <- df$y

repfun <- function(x, a, b) {
  x1 <- x
  for (i in 1:length(a)) {
    x1[which(x = =a[i])] <- b[i]
  }
  x1
}

names1 <- split(df$p1, df$cluster)
unames <- lapply(names1, unique)

pnames <- sample(unames, length(unames)) ## incorrect list syntax 
                                         ## but this is the closest
pn1 <- repfun(p1, unames, pnames)
pn2 <- repfun(p2, unames, pnames)
newID <- paste(pn1, pn2, sep = "")

py <- y[match(dyadID, newID)]

Everything I've tried leads to a py vector with way too many zeroes or NA values. Hopefully that makes sense. I've tried pasting the cluster ID to the last names but that's about as far as I've gotten and so far it doesn't work.

The dataset is very large (n = 432770), so efficiency would be beneficial. How could I get closer to what I want?

data

df <- data.frame(names1 = c("Jones", "Smith", "Jones", "Munoz", "Stewart", "Parker", "Murphy", "Wallace"),
                 names2 = c("Smith", "Jones", "Munoz", "Taylor", "Kelly", "Jones", "Wallace", "Neal"),
                 state = c("AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK"),
                 chamber = c("L", "L", "L", "L", "U", "U", "U", "U"),
                 year = c("2010", "2010", "2012", "2012", "2014", "2014", "2016", "2016"),
                 cluster = c("AKL10", "AKL10", "AKL12", "AKL12", "AKU14", "AKU14", "AKU16", "AKU16"),
                  y = c(1, 0, 0, 1, 1, 0, 1, 0))
df$dyadID <- paste(df$names1, df$names2, sep = "")

CodePudding user response：

How about this:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyr)
df <- data.frame(names1 = c("Jones", "Smith", "Jones", "Munoz", "Stewart", "Parker", "Murphy", "Wallace"),
                 names2 = c("Smith", "Jones", "Munoz", "Taylor", "Kelly", "Jones", "Wallace", "Neal"),
                 state = c("AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK"),
                 chamber = c("L", "L", "L", "L", "U", "U", "U", "U"),
                 year = c("2010", "2010", "2012", "2012", "2014", "2014", "2016", "2016"),
                 cluster = c("AKL10", "AKL10", "AKL12", "AKL12", "AKU14", "AKU14", "AKU16", "AKU16"),
                 y = c(1, 0, 0, 1, 1, 0, 1, 0))
df$dyadID <- paste(df$names1, df$names2, sep = "")


make_dyads <- function(name1, name2){
  uname <- unique(c(name1, name2))
  eg <- expand.grid(names1 = uname, names2 = uname)
  eg <- subset(eg, names1 != names2)
  eg$dyadID <- apply(as.matrix(eg), 1, paste, collapse="")
  eg
}

newdf <- df %>% group_by(state, chamber, year, cluster) %>% 
  summarise(dyads = make_dyads(names1, names2)) %>% 
  unnest(dyads) %>% 
  left_join(df %>% select(dyadID, y))
#> `summarise()` has grouped output by 'state', 'chamber', 'year', 'cluster'. You
#> can override using the `.groups` argument.
#> Joining, by = "dyadID"

newdf
#> # A tibble: 26 × 8
#> # Groups:   state, chamber, year, cluster [4]
#>    state chamber year  cluster names1 names2  dyadID            y
#>    <chr> <chr>   <chr> <chr>   <fct>  <fct>   <chr>         <dbl>
#>  1 AK    L       2010  AKL10   Smith  Jones   SmithJones        0
#>  2 AK    L       2010  AKL10   Jones  Smith   JonesSmith        1
#>  3 AK    L       2012  AKL12   Munoz  Jones   MunozJones       NA
#>  4 AK    L       2012  AKL12   Taylor Jones   TaylorJones      NA
#>  5 AK    L       2012  AKL12   Jones  Munoz   JonesMunoz        0
#>  6 AK    L       2012  AKL12   Taylor Munoz   TaylorMunoz      NA
#>  7 AK    L       2012  AKL12   Jones  Taylor  JonesTaylor      NA
#>  8 AK    L       2012  AKL12   Munoz  Taylor  MunozTaylor       1
#>  9 AK    U       2014  AKU14   Parker Stewart ParkerStewart    NA
#> 10 AK    U       2014  AKU14   Kelly  Stewart KellyStewart     NA
#> # … with 16 more rows

^{Created on 2022-04-02 by the reprex package (v2.0.1)}

CodePudding user response：

To sample within clusters you could first split the data frame by cluster and apply the sampling process in the slices. You may use by, which you can think as a combination of split and lapply.

For the sampling process in a cluster, you want to unlist the first two columns 1:2 containing the names. Then you want to sample w/o replacement from all possible permutations of length m=2. RcppAlgos::permuteGeneral will do the job very fast. Then just create newIDs and cbind to a re-shuffled cluster slice. Finally do.call(rbind, ...) the result.

samp_fun <- \(dat) {
  by(dat, dat$cluster, \(x) {
    u <- unique(unname(unlist(x[1:2])))
    perm <- RcppAlgos::permuteGeneral(u, 2)
    s <- perm[sample(nrow(perm), nrow(x)), ]
    newID <- apply(s, 1, paste, collapse='')
    y <- sapply(newID, \(i) x$y[match(i, x$dyadID)])
    cbind(s, x[3:6], y, newID) |> setNames(names(x))
  }) |> c(make.row.names=FALSE) |> do.call(what=rbind)
}

Gives

One simulation gives a shuffled data frame with same dimensions and same nobs per cluster as the initial data.

set.seed(254477)  ## for sake of reproducibility
samp_fun(dat)
#    names1  names2 state chamber year cluster  y        dyadID
# 1   Smith   Jones    AK       L 2010   AKL10  0    SmithJones
# 2   Jones   Smith    AK       L 2010   AKL10  1    JonesSmith
# 3   Jones   Munoz    AK       L 2012   AKL12  0    JonesMunoz
# 4   Munoz  Taylor    AK       L 2012   AKL12  1   MunozTaylor
# 5 Stewart   Kelly    AK       U 2014   AKU14  1  StewartKelly
# 6  Parker Stewart    AK       U 2014   AKU14 NA ParkerStewart
# 7  Murphy    Neal    AK       U 2016   AKU16 NA    MurphyNeal
# 8  Murphy Wallace    AK       U 2016   AKU16  1 MurphyWallace

You may easily put the function into replicate to simulate a list of nsim data sets.

set.seed(254477)
nsim <- 100
sims <- replicate(nsim, samp_fun(dat), simplify=FALSE)

If you want a very large number of simulations, you could do it in parallel e.g. using parLapply of the parallel package.

Note: R >= 4.1 used.

Data:

dat <- structure(list(names1 = c("Jones", "Smith", "Jones", "Munoz", 
"Stewart", "Parker", "Murphy", "Wallace"), names2 = c("Smith", 
"Jones", "Munoz", "Taylor", "Kelly", "Jones", "Wallace", "Neal"
), state = c("AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK"), 
    chamber = c("L", "L", "L", "L", "U", "U", "U", "U"), year = c("2010", 
    "2010", "2012", "2012", "2014", "2014", "2016", "2016"), 
    cluster = c("AKL10", "AKL10", "AKL12", "AKL12", "AKU14", 
    "AKU14", "AKU16", "AKU16"), y = c(1, 0, 0, 1, 1, 0, 1, 0)), class = "data.frame", row.names = c(NA, 
-8L))