I have a dataframe that is set up as follows:
# Simulate 50 participants, each seeing the three stimuli "a", "b" and "c",
# with three numeric dependent variables per row.
set.seed(1234)
id <- rep(1:50, each = 3)
stimuli <- rep(c("a", "b", "c"), times = 50)
dv_1 <- rnorm(150, mean = 2, sd = 0.7)
dv_2 <- rnorm(150, mean = 4, sd = 1.5)
dv_3 <- rnorm(150, mean = 7.5, sd = 1)
simdat <- data.frame(id, stimuli, dv_1, dv_2, dv_3)

# Rank the three dependent variables within each row. Only the numeric dv
# columns are passed to apply(): including the character `stimuli` column
# would coerce every row to character, so the values would be ranked as
# strings and a meaningless `stimuli_rank` column would be created.
dv_cols <- c("dv_1", "dv_2", "dv_3")
simranks <- t(apply(as.matrix(simdat[, dv_cols]), 1, rank, ties.method = "min"))
colnames(simranks) <- paste0(colnames(simranks), "_rank")
simdat <- data.frame(simdat, simranks)
I have then split the data frame according to the different types of stimuli, e.g.,
# Rows for stimulus "a" only, reduced to the id and the three rank columns.
dat_a <- select(
  dplyr::filter(simdat, stimuli == "a"),
  id, dv_1_rank, dv_2_rank, dv_3_rank
)
Then I would like to perform a bunch of different analyses on the subsetted data:
# Reshape the rank columns of dat_a to long format: one row per id and
# rank variable, with the rank itself in `value`.
a_melt <- melt(
  dat_a,
  id.vars = "id",
  measure.vars = c("dv_1_rank", "dv_2_rank", "dv_3_rank")
)

# Count how often each rank value occurs for each variable, then express the
# counts as a share within that variable. `.groups = "drop_last"` keeps the
# grouping by `variable` for the mutate() step; the result is ungrouped after.
a_perc <- a_melt %>%
  group_by(variable, value) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(perc = count / sum(count)) %>%
  ungroup()

# Stacked bar chart of the shares. The layers must be chained with `+`;
# without it the geom and the scale are separate statements and are never
# added to the plot.
ggplot(a_perc, aes(x = variable, y = perc, fill = value)) +
  geom_col(position = "stack") +
  scale_y_continuous(labels = scales::percent)
How can I write the code so that, rather than copying and pasting the code chunks for stimuli b and c, it loops over every level of the "stimuli" column in the original dataset?
CodePudding user response:
Here is how I usually deal with this.
- I split my dataset into chunks gathered in a list, then
- I use `lapply` or `purrr::map` to apply a function, which does the analysis for one chunk, to each chunk.
Here you could do something like:
#' Plot the share of each rank value per dependent variable for one chunk.
#'
#' @param .df A data frame with an `id` column and the rank columns
#'   `dv_1_rank`, `dv_2_rank` and `dv_3_rank`.
#' @return A ggplot object: a stacked bar chart with one bar per rank
#'   variable, filled by rank value, on a percent-labelled y axis.
analyses <- function(.df) {
  # Packages are referenced by namespace instead of require(), which returns
  # FALSE rather than failing and attaches packages as a side effect.
  rank_cols <- c("dv_1_rank", "dv_2_rank", "dv_3_rank")

  # data.table::melt() is meant for data.tables, so convert explicitly rather
  # than relying on the deprecated redirection for plain data frames.
  df_melt <- as.data.frame(
    data.table::melt(
      data.table::as.data.table(.df),
      id.vars = "id",
      measure.vars = rank_cols
    )
  )

  # Count each rank value per variable; keep the grouping by `variable` so the
  # share is computed within each variable, then drop the grouping.
  df_counts <- dplyr::summarise(
    dplyr::group_by(df_melt, variable, value),
    count = dplyr::n(),
    .groups = "drop_last"
  )
  df_perc <- dplyr::ungroup(
    dplyr::mutate(df_counts, perc = count / sum(count))
  )

  # The layers must be joined with `+`; otherwise only the last expression
  # (the scale) would be returned instead of the plot.
  ggplot2::ggplot(
    df_perc,
    ggplot2::aes(x = variable, y = perc, fill = value)
  ) +
    ggplot2::geom_col(position = "stack") +
    ggplot2::scale_y_continuous(labels = scales::percent)
}

# One plot per stimulus, returned as a list named by stimulus level.
lapply(split(simdat, ~ stimuli), analyses)
Note: this is just a proof of concept.
CodePudding user response:
Does this produce your desired output?
library(tidyverse)

# Example data: 20 ids for each of the three stimuli, with three rank columns
# drawn as whole numbers via floor(runif(..., 1, 5)).
simdat <- mutate(
  expand_grid(stimuli = c("a", "b", "c"), id = 1:20),
  dv_1_rank = floor(runif(n(), 1, 5)),
  dv_2_rank = floor(runif(n(), 1, 5)),
  dv_3_rank = floor(runif(n(), 1, 5))
)

# Long format, then the number of rows per stimulus / variable / rank value,
# and finally each count as a share of its stimulus-by-variable total.
a_perc <- simdat %>%
  pivot_longer(cols = dv_1_rank:dv_3_rank) %>%
  count(stimuli, name, value, name = "count") %>%
  group_by(stimuli, name) %>%
  mutate(perc = count / sum(count)) %>%
  ungroup()

print(a_perc)
#> # A tibble: 36 x 5
#> stimuli name value count perc
#> <chr> <chr> <dbl> <int> <dbl>
#> 1 a dv_1_rank 1 4 0.2
#> 2 a dv_1_rank 2 8 0.4
#> 3 a dv_1_rank 3 5 0.25
#> 4 a dv_1_rank 4 3 0.15
#> 5 a dv_2_rank 1 5 0.25
#> 6 a dv_2_rank 2 6 0.3
#> 7 a dv_2_rank 3 4 0.2
#> 8 a dv_2_rank 4 5 0.25
#> 9 a dv_3_rank 1 3 0.15
#> 10 a dv_3_rank 2 5 0.25
#> # ... with 26 more rows
Created on 2022-03-14 by the reprex package (v2.0.1)