I have data that lists the last names of each legislator who served in legislative sessions from six states during 2009-2018. Here's an abbreviated example:
print(df)
names1 names2 state chamber year cluster y dyadID
1 Jones Smith AK L 2010 AKL10 1 JonesSmith
2 Smith Jones AK L 2010 AKL10 0 SmithJones
3 Jones Munoz AK L 2012 AKL12 0 JonesMunoz
4 Munoz Taylor AK L 2012 AKL12 1 MunozTaylor
5 Stewart Kelly AK U 2014 AKU14 1 StewartKelly
6 Parker Jones AK U 2014 AKU14 0 ParkerJones
7 Murphy Wallace AK U 2016 AKU16 1 MurphyWallace
8 Wallace Neal AK U 2016 AKU16 0 WallaceNeal
I'm using a simulation method that systematically re-shuffles the order of the names, but I don't want names from different clusters mixed together. For example, Jones and Smith form one potential dyad because they're in the same cluster (AKL10), but I wouldn't want a Jones/Smith dyad to be possible for AKL12 since Smith is not in that cluster.
I can get it easily with the observed dyads, but not all potential observed & unobserved combinations. So far, I've tried nesting the names in lists by cluster but I can't figure out how to maintain those groups in my resampling function. Here's what I've got so far:
cluster <- df$cluster
p1 <- df$names1
p2 <- df$names2
y <- df$y
repfun <- function(x, a, b) {
x1 <- x
for (i in 1:length(a)) {
x1[which(x = =a[i])] <- b[i]
}
x1
}
names1 <- split(df$p1, df$cluster)
unames <- lapply(names1, unique)
pnames <- sample(unames, length(unames)) ## incorrect list syntax
## but this is the closest
pn1 <- repfun(p1, unames, pnames)
pn2 <- repfun(p2, unames, pnames)
newID <- paste(pn1, pn2, sep = "")
py <- y[match(dyadID, newID)]
Everything I've tried leads to a py
vector with way too many zeroes or NA values. Hopefully that makes sense. I've tried pasting the cluster ID to the last names but that's about as far as I've gotten and so far it doesn't work.
The dataset is very large (n = 432770), so efficiency would be beneficial. How could I get closer to what I want?
data
df <- data.frame(names1 = c("Jones", "Smith", "Jones", "Munoz", "Stewart", "Parker", "Murphy", "Wallace"),
names2 = c("Smith", "Jones", "Munoz", "Taylor", "Kelly", "Jones", "Wallace", "Neal"),
state = c("AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK"),
chamber = c("L", "L", "L", "L", "U", "U", "U", "U"),
year = c("2010", "2010", "2012", "2012", "2014", "2014", "2016", "2016"),
cluster = c("AKL10", "AKL10", "AKL12", "AKL12", "AKU14", "AKU14", "AKU16", "AKU16"),
y = c(1, 0, 0, 1, 1, 0, 1, 0))
df$dyadID <- paste(df$names1, df$names2, sep = "")
CodePudding user response:
How about this:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
df <- data.frame(names1 = c("Jones", "Smith", "Jones", "Munoz", "Stewart", "Parker", "Murphy", "Wallace"),
names2 = c("Smith", "Jones", "Munoz", "Taylor", "Kelly", "Jones", "Wallace", "Neal"),
state = c("AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK"),
chamber = c("L", "L", "L", "L", "U", "U", "U", "U"),
year = c("2010", "2010", "2012", "2012", "2014", "2014", "2016", "2016"),
cluster = c("AKL10", "AKL10", "AKL12", "AKL12", "AKU14", "AKU14", "AKU16", "AKU16"),
y = c(1, 0, 0, 1, 1, 0, 1, 0))
df$dyadID <- paste(df$names1, df$names2, sep = "")
make_dyads <- function(name1, name2){
uname <- unique(c(name1, name2))
eg <- expand.grid(names1 = uname, names2 = uname)
eg <- subset(eg, names1 != names2)
eg$dyadID <- apply(as.matrix(eg), 1, paste, collapse="")
eg
}
newdf <- df %>% group_by(state, chamber, year, cluster) %>%
summarise(dyads = make_dyads(names1, names2)) %>%
unnest(dyads) %>%
left_join(df %>% select(dyadID, y))
#> `summarise()` has grouped output by 'state', 'chamber', 'year', 'cluster'. You
#> can override using the `.groups` argument.
#> Joining, by = "dyadID"
newdf
#> # A tibble: 26 × 8
#> # Groups: state, chamber, year, cluster [4]
#> state chamber year cluster names1 names2 dyadID y
#> <chr> <chr> <chr> <chr> <fct> <fct> <chr> <dbl>
#> 1 AK L 2010 AKL10 Smith Jones SmithJones 0
#> 2 AK L 2010 AKL10 Jones Smith JonesSmith 1
#> 3 AK L 2012 AKL12 Munoz Jones MunozJones NA
#> 4 AK L 2012 AKL12 Taylor Jones TaylorJones NA
#> 5 AK L 2012 AKL12 Jones Munoz JonesMunoz 0
#> 6 AK L 2012 AKL12 Taylor Munoz TaylorMunoz NA
#> 7 AK L 2012 AKL12 Jones Taylor JonesTaylor NA
#> 8 AK L 2012 AKL12 Munoz Taylor MunozTaylor 1
#> 9 AK U 2014 AKU14 Parker Stewart ParkerStewart NA
#> 10 AK U 2014 AKU14 Kelly Stewart KellyStewart NA
#> # … with 16 more rows
Created on 2022-04-02 by the reprex package (v2.0.1)
CodePudding user response:
To sample within clusters you could first split
the data frame by cluster and apply the sampling process in the slices. You may use by
, which you can think as a combination of split
and lapply
.
For the sampling process in a cluster, you want to unlist
the first two columns 1:2
containing the names. Then you want to sample
w/o replacement from all possible permutations of length m=2
. RcppAlgos::permuteGeneral
will do the job very fast. Then just create newID
s and cbind
to a re-shuffled cluster slice. Finally do.call(rbind, ...)
the result.
samp_fun <- \(dat) {
by(dat, dat$cluster, \(x) {
u <- unique(unname(unlist(x[1:2])))
perm <- RcppAlgos::permuteGeneral(u, 2)
s <- perm[sample(nrow(perm), nrow(x)), ]
newID <- apply(s, 1, paste, collapse='')
y <- sapply(newID, \(i) x$y[match(i, x$dyadID)])
cbind(s, x[3:6], y, newID) |> setNames(names(x))
}) |> c(make.row.names=FALSE) |> do.call(what=rbind)
}
Gives
One simulation gives a shuffled data frame with same dimensions and same nobs per cluster as the initial data.
set.seed(254477) ## for sake of reproducibility
samp_fun(dat)
# names1 names2 state chamber year cluster y dyadID
# 1 Smith Jones AK L 2010 AKL10 0 SmithJones
# 2 Jones Smith AK L 2010 AKL10 1 JonesSmith
# 3 Jones Munoz AK L 2012 AKL12 0 JonesMunoz
# 4 Munoz Taylor AK L 2012 AKL12 1 MunozTaylor
# 5 Stewart Kelly AK U 2014 AKU14 1 StewartKelly
# 6 Parker Stewart AK U 2014 AKU14 NA ParkerStewart
# 7 Murphy Neal AK U 2016 AKU16 NA MurphyNeal
# 8 Murphy Wallace AK U 2016 AKU16 1 MurphyWallace
You may easily put the function into replicate
to simulate a list of nsim
data sets.
set.seed(254477)
nsim <- 100
sims <- replicate(nsim, samp_fun(dat), simplify=FALSE)
If you want a very large number of simulations, you could do it in parallel e.g. using parLapply
of the parallel
package.
Note: R >= 4.1 used.
Data:
dat <- structure(list(names1 = c("Jones", "Smith", "Jones", "Munoz",
"Stewart", "Parker", "Murphy", "Wallace"), names2 = c("Smith",
"Jones", "Munoz", "Taylor", "Kelly", "Jones", "Wallace", "Neal"
), state = c("AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK"),
chamber = c("L", "L", "L", "L", "U", "U", "U", "U"), year = c("2010",
"2010", "2012", "2012", "2014", "2014", "2016", "2016"),
cluster = c("AKL10", "AKL10", "AKL12", "AKL12", "AKU14",
"AKU14", "AKU16", "AKU16"), y = c(1, 0, 0, 1, 1, 0, 1, 0)), class = "data.frame", row.names = c(NA,
-8L))