I am working with the R programming language.
I have the following dataset that contains the heights and weights of people from Canada - using the value of height (cm), I split weight (kg) into bins based on ntiles, and calculated the average value of var2 within each ntile bin:
library(dplyr)
library(gtools)
set.seed(123)
canada = data.frame(height = rnorm(10000,150,10), weight = rnorm(10000,90, 10))
Part_1 = canada %>%
mutate(quants = quantcut(weight, 100),
rank = as.numeric(quants)) %>%
group_by(quants) %>%
mutate(min = min(weight), max = max(weight), count = n(), avg_height = mean(height))
Part_1 = Part_1 %>% distinct(rank, .keep_all = TRUE)
> Part_1
# A tibble: 100 x 8
# Groups: quants [100]
height weight quants rank min max count avg_height
<dbl> <dbl> <fct> <dbl> <dbl> <dbl> <int> <dbl>
1 144. 114. (110.2,113.9] 99 110. 114. 100 150.
2 148. 88.3 (88.12,88.38] 44 88.1 88.4 100 149.
3 166. 99.3 (99.1,99.52] 83 99.1 99.5 100 152.
4 151. 84.3 (84.14,84.44] 29 84.1 84.4 100 150.
For example, I see that there are 100 people between the weight range of 100.2 - 113.9 kg and the average height of these people is 150 cm
Now, suppose I have a similar dataset for people from the USA:
set.seed(124)
usa = data.frame(height = rnorm(10000,150,10), weight = rnorm(10000,90, 10))
My Question: Based on the weight ranges I calculated using the Canada dataset - I want to find out how many people from the USA fall within these Canadian ranges and what is the average weight of the Americans within these Canadian ranges
For example:
- In the Canada dataset, I saw that there are 100 people between the weight range of 100.2 - 113.9 kg and the average height of these people is 150 cm
- How many Americans are between the weight range of 100.2 - 113.9 kg and what is the average height of these Americans?
I know that I can do this manually for each rank:
americans_in_canadian_rank99 = usa %>%
filter(weight > 110.2 & weight < 113.9) %>%
group_by() %>%
summarize(count = n(), avg_height = mean(height))
americans_in_canadian_rank44 = usa %>%
filter(weight > 88.1 & weight < 88.4) %>%
group_by() %>%
summarize(count = n(), avg_height = mean(height))
In the end, I would be looking for something a desired output like this:
# number of rows should be = number of unique ranks
canadian_rank min_weight max_weight canadian_count canadian_avg_height american_count american_avg_height
1 99 110.2 113.9 100 150 116 150
2 44 88.1 88.4 100 149 154 150
Can someone please help me figure out a better way to do this?
Thanks!
CodePudding user response:
With data.table
you can do this:
library(data.table)
library(stringr)
dt1 <- as.data.table(usa)
dt1 <- dt1[, c("min", "max") := weight]
dt2 <- as.data.table(Part_1 %>% select("quants", "rank"))
dt2 <- cbind(dt2[,.(rank)],
setDT(tstrsplit(str_sub(dt2$quants, 2, -2), ",", fixed = TRUE, names = c("min", "max"))))
dt2 <- dt2[, lapply(.SD, as.numeric)]
setkey(dt2, min, max)
dt1 <- dt1[, rank := dt2$rank[foverlaps(dt1, dt2, by.x = c("min", "max"), by.y = c("min", "max"), which = TRUE)$yid]] %>%
select(-c("min", "max"))
EDIT
Totally missed the last part. But if you wish to do that, it should be relatively straightforward from the last point (you could use dplyr
for that if you wish):
dt3 <- rbind(canada %>%
mutate(quants = quantcut(weight, 100),
rank = as.numeric(quants),
country = "Canada") %>%
as.data.table(),
copy(dt1)[, country := "USA"], fill = TRUE)
dt3 <- dt3[,.(count = .N, avg_height = mean(height)), by = c("rank", "country")] %>%
dcast(rank ~ country, value.var = c("count", "avg_height")) %>%
merge(dt2 %>% rename("min_weight" = "min", "max_weight" = "max"), by = c("rank"), all.x = TRUE)
EDIT 2
Alternatively, you could try to do something similar using cut
function without learning anything from data.table
rank_breaks <- Part_1 %>%
mutate(breaks = sub(",.*", "", str_sub(quants, 2)) %>% as.numeric()) %>%
arrange(rank) %>%
pull(breaks)
# Here I change minimum and maximum of groups 1 and 100 to -Inf and Inf respectively.
# If you do not wish to do so, you can disregard it and run `rank_breaks <- c(rank_breaks, max(canada$weight))` instead
rank_breaks[1] <- -Inf
rank_breaks <- c(rank_breaks, Inf)
usa <- usa %>%
mutate(rank = cut(weight, breaks = rank_breaks, labels = c(1:100)))
CodePudding user response:
You can use fuzzyjoin
for this.
library(fuzzyjoin)
# take percentile ranges and join US data
us_by_canadian_quantiles <- Part_1 |>
ungroup() |>
distinct(rank, min, max, height_avg_can = avg_height) |>
fuzzy_full_join(usa, by = c(min = "weight", max = "weight"), match_fun = c(`<`, `>=`))
# get count and height average per bin
us_by_canadian_quantiles |>
group_by(rank) |>
summarize(n_us = n(),
height_avg_us = mean(height),
height_avg_can = first(height_avg_can)
)
#> # A tibble: 101 × 4
#> rank n_us height_avg_us height_avg_can
#> <dbl> <int> <dbl> <dbl>
#> 1 1 114 150. 149.
#> 2 2 119 149. 149.
#> 3 3 94 148. 151.
#> 4 4 104 150. 150.
#> 5 5 115 152. 150.
#> 6 6 88 150. 149.
#> 7 7 86 150. 150.
#> 8 8 86 150. 151.
#> 9 9 102 151. 151.
#> 10 10 81 152. 150.
#> # … with 91 more rows
Note that there are a number of cases in the US frame which fall outside of the Canadian percentile ranges. They are grouped together here with rank
being NA
, but you could also add ranks 0 and 101 if you wanted to distinguish them.
I should note that fuzzyjoin
tends to be much slower than data.table
. But since you have put a bounty on this question despite having already gotten a data.table
solution, this might be more to your liking.