Home > Software engineering >  Dividing non-equal dataframes based on a group condition
Dividing non-equal dataframes based on a group condition

Time:09-05

I have two dataframes, with a similar strucure:

df_I <- structure(list(year = c("2006", "2006", "2006", "2006", "2006", 
"2006", "2006", "2006", "2006"), code = c(0, 1110, 
1120, 1130, 1220, 1230, 1310, 1320, 1330), `1` = c(1L, 
8L, 2L, 2L, 0L, 2L, 0L, 1L, 0L), `2` = c(0L, 10L, 0L, 0L, 
0L, 2L, 1L, 3L, 1L), `3` = c(4L, 2L, 1L, 2L, 0L, 4L, 
0L, 0L, 3L), `4` = c(4L, 6L, 0L, 3L, 1L, 3L, 0L, 0L, 3L), 
    totaal = c(11, 26, 3, 7, 1, 9, 7, 7, 6)), row.names = c(NA, 
-9L), class = c("tbl_df", "tbl", "data.frame"))

# A tibble: 9 × 7
  year   code   `1`   `2`   `3`   `4` totaal
  <chr> <dbl> <int> <int> <int> <int>  <dbl>
1 2006      0     1     0     4     4     11
2 2006   1110     8    10     2     6     26
3 2006   1120     2     0     1     0      3
4 2006   1130     2     0     2     3      7
5 2006   1220     0     0     0     1      1
6 2006   1230     2     2     4     3      9
7 2006   1310     0     1     0     0      7
8 2006   1320     1     3     0     0      7
9 2006   1330     0     1     3     3      6

df_II <- structure(list(year = c("2006", "2006", "2006", "2006", "2006", 
"2006", "2006", "2006", "2006", "2006"), code = c(0, 1110, 
1120, 1130, 1210, 1220, 1230, 1310, 1320, 1330), `1` = c(15806L, 
655L, 105L, 328L, 138L, 452L, 445L, 471L, 672L, 615L), `2` = c(9681L, 
337L, 68L, 215L, 97L, 357L, 366L, 245L, 440L, 360L), `3` = c(10457L, 
221L, 40L, 123L, 65L, 325L, 322L, 151L, 352L, 332L), `4` = c(7109L, 
128L, 5L, 64L, 56L, 256L, 240L, 83L, 274L, 192L), totaal = c(43053, 
1341, 218, 730, 356, 1390, 1373, 950, 1738, 1499)), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"))

# A tibble: 10 × 7
   year   code   `1`   `2`   `3`   `4` totaal
   <chr> <dbl> <int> <int> <int> <int>  <dbl>
 1 2006      0 15806  9681 10457  7109  43053
 2 2006   1110   655   337   221   128   1341
 3 2006   1120   105    68    40     5    218
 4 2006   1130   328   215   123    64    730
 5 2006   1210   138    97    65    56    356
 6 2006   1220   452   357   325   256   1390
 7 2006   1230   445   366   322   240   1373
 8 2006   1310   471   245   151    83    950
 9 2006   1320   672   440   352   274   1738
10 2006   1330   615   360   332   192   1499

I would like to create a new data.frame df_out, which divides df_I by df_II, for columns 1,2,3,4, totaal by year and code. The issue is that not every code is available for each year.

What is the best way to divide this unequal dataframe?

Desired outcome:

# A tibble: 10 × 7
   year       code   `1`   `2`   `3`   `4` totaal
   <chr>     <dbl> <int> <int> <int> <int>  <dbl>
 1 2006       0 1 /15806 0/9681 4/10457  4/7109  11/43053

CodePudding user response:

You could subset the second data frame using %in%, assuming both code columns are properly ordered.

cols <- as.character(1:4)
cbind(df_I[setdiff(names(df_I), cols)], df_I[cols] / subset(df_II, code %in% df_I$code, cols))
#   year code totaal            1           2            3           4
# 1 2006    0     11 6.326711e-05 0.000000000 0.0003825189 0.000562667
# 2 2006 1110     26 1.221374e-02 0.029673591 0.0090497738 0.046875000
# 3 2006 1120      3 1.904762e-02 0.000000000 0.0250000000 0.000000000
# 4 2006 1130      7 6.097561e-03 0.000000000 0.0162601626 0.046875000
# 5 2006 1220      1 0.000000e 00 0.000000000 0.0000000000 0.003906250
# 6 2006 1230      9 4.494382e-03 0.005464481 0.0124223602 0.012500000
# 7 2006 1310      7 0.000000e 00 0.004081633 0.0000000000 0.000000000
# 8 2006 1320      7 1.488095e-03 0.006818182 0.0000000000 0.000000000
# 9 2006 1330      6 0.000000e 00 0.002777778 0.0090361446 0.015625000

CodePudding user response:

You could use complete to make the number of rows between the two data frames equal, and then do the division:

library(tidyr)
df_I %<>% 
  complete(code = df_II$code) %>% 
  fill(year) %>% 
  replace(is.na(.), 0)

cbind(df_I[c(1, 2)], df_I[-c(1, 2)] / df_II[-c(1, 2)])
    code year        `1`     `2`      `3`      `4`   totaal
   <dbl> <chr>     <dbl>   <dbl>    <dbl>    <dbl>    <dbl>
 1     0 2006  0.0000633 0       0.000383 0.000563 0.000255
 2  1110 2006  0.0122    0.0297  0.00905  0.0469   0.0194  
 3  1120 2006  0.0190    0       0.025    0        0.0138  
 4  1130 2006  0.00610   0       0.0163   0.0469   0.00959 
 5  1210 2006  0         0       0        0        0       
 6  1220 2006  0         0       0        0.00391  0.000719
 7  1230 2006  0.00449   0.00546 0.0124   0.0125   0.00655 
 8  1310 2006  0         0.00408 0        0        0.00737 
 9  1320 2006  0.00149   0.00682 0        0        0.00403 
10  1330 2006  0         0.00278 0.00904  0.0156   0.00400 
  •  Tags:  
  • r
  • Related