I want to sum rows based on two different categorical columns. my data is like this :
df <- data.frame(country = c('US', 'US', 'US', 'UK', 'UK', 'UK', 'UK'),
team = c('A1', 'A2', 'B1', 'B1', 'B2', 'C1', 'C2'),
x1990 = c(4, 7, 8, 8, 8, 9, 12),
x2005 = c(3, 3, 4, 4, 6, 7, 7))
and I want to prepare it like this:
df_sum <- data.frame(country = c('US', 'US', 'UK','UK'),
team = c('A', 'B','B', 'C'),
x1990 = c(11, 8,16,21),
x2005 = c(6, 4,10, 14))
CodePudding user response:
Not sure if this is exactly what you're after, as some of the information in "Team" is lost (the digits are ignored), but this provides your expected output with your example input:
df <- data.frame(country = c('US', 'US', 'US', 'UK', 'UK', 'UK', 'UK'),
team = c('A1', 'A2', 'B1', 'B1', 'B2', 'C1', 'C2'),
x1990 = c(4, 7, 8, 8, 8, 9, 12),
x2005 = c(3, 3, 4, 4, 6, 7, 7))
df
#> country team x1990 x2005
#> 1 US A1 4 3
#> 2 US A2 7 3
#> 3 US B1 8 4
#> 4 UK B1 8 4
#> 5 UK B2 8 6
#> 6 UK C1 9 7
#> 7 UK C2 12 7
df_sum <- data.frame(country = c('US', 'US', 'UK','UK'),
team = c('A', 'B','B', 'C'),
x1990 = c(11, 8,16,21),
x2005 = c(6, 4,10, 14))
df_sum
#> country team x1990 x2005
#> 1 US A 11 6
#> 2 US B 8 4
#> 3 UK B 16 10
#> 4 UK C 21 14
aggregate(df[,3:4], by = list(df$country, gsub("\\d ", "", df$team)), FUN = sum)
#> Group.1 Group.2 x1990 x2005
#> 1 US A 11 6
#> 2 UK B 16 10
#> 3 US B 8 4
#> 4 UK C 21 14
Created on 2022-06-07 by the reprex package (v2.0.1)
Edit
You also probably want to sort the 'new' dataframe and relabel the columns:
df2 <- aggregate(df[,3:4], by = list(df$country, gsub("\\d ", "", df$team)), FUN = sum)
colnames(df2) <- c("Country", "Team", "x1990", "x2005")
df2[order(df2$Country, decreasing = TRUE),]
#> Country Team x1990 x2005
#> 1 US A 11 6
#> 3 US B 8 4
#> 2 UK B 16 10
#> 4 UK C 21 14
Created on 2022-06-07 by the reprex package (v2.0.1)
CodePudding user response:
You may try using dplyr
like
library(dplyr)
df %>%
mutate(team = sub("^([[:alpha:]]*).*", "\\1", team)) %>%
group_by(country, team) %>%
summarize(across(where(is.numeric), sum)) %>%
arrange(desc(country))
country team x1990 x2005
<chr> <chr> <dbl> <dbl>
1 US A 11 6
2 US B 8 4
3 UK B 16 10
4 UK C 21 14
CodePudding user response:
You can use
Data
df <- data.frame(country = c('US', 'US', 'US', 'UK', 'UK', 'UK', 'UK'),
team = c('A1', 'A2', 'B1', 'B1', 'B2', 'C1', 'C2'),
x1990 = c(4, 7, 8, 8, 8, 9, 12),
x2005 = c(3, 3, 4, 4, 6, 7, 7))
library(dplyr)
df %>% mutate(teams = sub("[0-9]" , "" , team)) %>%
group_by(country,teams) %>% summarise(x1990 = sum(x1990) , x2005 = sum(x2005)) %>%
arrange(desc(country))
# A tibble: 4 × 4
# Groups: country [2]
country teams x1990 x2005
<chr> <chr> <dbl> <dbl>
1 US A 11 6
2 US B 8 4
3 UK B 16 10
4 UK C 21 14