Home > Software engineering >  Group_by data frame then divide numeric columns by row means
Group_by data frame then divide numeric columns by row means

Time:07-26

I have a data frame which looks like the following. I would like to group_by by Group, then calculate the rowMeans of R1 and R2 variables when Position == "position_1" (in the case of A group, the calculated value is 1.935561 and in the case of B Group, the value is 1.30338), followed by dividing R1 and R2 by the calculated rowMeans for each group. I was trying to run the following code but got an error. Does anyone know how to solve this issue with mirror changes to the original code (e.g., no need to transform the data frame to a longer format)? Thanks for any help!

test <- my_df %>% 
  group_by(Group) %>% 
   mutate(across(starts_with("R"), ~ .x/ rowMeans(.x %>% filter(Position == "position_1") %>% dplyr::select(starts_with("R")), na.rm = T)))

The error message:

Error in `mutate_cols()`:
! Problem with `mutate()` input `..1`.
i `..1 = across(...)`.
x no applicable method for 'filter' applied to an object of class "c('double', 'numeric')"
i The error occurred in group 1: Group = "A".

Original df:

> my_df
# A tibble: 18 x 4
   Group Position       R1     R2
   <chr> <chr>       <dbl>  <dbl>
 1 A     position_1 1.94   1.93  
 2 A     position_2 1.92   1.94  
 3 A     position_3 1.50   1.44  
 4 A     position_4 1.63   1.73  
 5 A     position_5 1.37   1.32  
 6 A     position_6 0.324  0.356 
 7 A     position_7 0.144  0.145 
 8 A     position_8 0.107  0.0770
 9 A     position_9 0.0681 0.0618
10 B     position_1 1.42   1.19  
11 B     position_2 1.59   1.47  
12 B     position_3 1.13   1.09  
13 B     position_4 1.45   1.27  
14 B     position_5 1.45   1.24  
15 B     position_6 0.919  0.866 
16 B     position_7 0.601  0.786 
17 B     position_8 0.276  0.553 
18 B     position_9 0.167  0.533 

 structure(list(Group = c("A", "A", "A", "A", "A", "A", "A", "A", 
    "A", "B", "B", "B", "B", "B", "B", "B", "B", "B"), Position = c("position_1", 
    "position_2", "position_3", "position_4", "position_5", "position_6", 
    "position_7", "position_8", "position_9", "position_1", "position_2", 
    "position_3", "position_4", "position_5", "position_6", "position_7", 
    "position_8", "position_9"), R1 = c(1.93682154090573, 1.91856596758703, 
    1.49858450015508, 1.63084469573099, 1.37215468874851, 0.324435564944789, 
    0.143752050868138, 0.106781932917991, 0.0680590581417462, 1.41897401502389, 
    1.58660492722105, 1.13036771612536, 1.44662711984727, 1.45416934674475, 
    0.919179924031544, 0.600901176433165, 0.275888860745681, 0.167286913827297
    ), R2 = c(1.9343002335802, 1.93865578506416, 1.44482785753871, 
    1.72787733301708, 1.31512320353745, 0.355672021297666, 0.144683081118674, 
    0.0770487437127792, 0.061811741133288, 1.18778573693332, 1.47211785392794, 
    1.09160495952408, 1.27155041680802, 1.23843099839242, 0.866062777290239, 
    0.786331686244975, 0.553327119553047, 0.53278845132597)), row.names = c(NA, 
    -18L), class = c("tbl_df", "tbl", "data.frame"))

My final df:

> final_df
# A tibble: 18 x 4
   Group Position       R1     R2
   <chr> <chr>       <dbl>  <dbl>
 1 A     position_1 1.00   0.999 
 2 A     position_2 0.991  1.00  
 3 A     position_3 0.774  0.746 
 4 A     position_4 0.843  0.893 
 5 A     position_5 0.709  0.679 
 6 A     position_6 0.168  0.184 
 7 A     position_7 0.0743 0.0747
 8 A     position_8 0.0552 0.0398
 9 A     position_9 0.0352 0.0319
10 B     position_1 1.09   0.911 
11 B     position_2 1.22   1.13  
12 B     position_3 0.867  0.838 
13 B     position_4 1.11   0.976 
14 B     position_5 1.12   0.950 
15 B     position_6 0.705  0.664 
16 B     position_7 0.461  0.603 
17 B     position_8 0.212  0.425 
18 B     position_9 0.128  0.409 

structure(list(Group = c("A", "A", "A", "A", "A", "A", "A", "A", 
"A", "B", "B", "B", "B", "B", "B", "B", "B", "B"), Position = c("position_1", 
"position_2", "position_3", "position_4", "position_5", "position_6", 
"position_7", "position_8", "position_9", "position_1", "position_2", 
"position_3", "position_4", "position_5", "position_6", "position_7", 
"position_8", "position_9"), R1 = c(1.00065125351551, 0.991219583152912, 
0.774237805036926, 0.842569516399114, 0.708918338790927, 0.167618362296404, 
0.074268933331545, 0.0551684668775569, 0.0351624454831164, 1.08868788459535, 
1.21730034772748, 0.86725875502567, 1.10990434090386, 1.115691008566, 
0.705227887516721, 0.461032988409493, 0.211671853753841, 0.128348535214057
), R2 = c(0.999348629973531, 1.00159890856664, 0.746464646445505, 
0.892701047922063, 0.679453245615846, 0.183756554971745, 0.0747499464592819, 
0.0398069312787244, 0.0319347936506718, 0.911311925097299, 1.1294617486289, 
0.837518574417343, 0.975579199318708, 0.950168790676873, 0.664474502670164, 
0.603301942829393, 0.424532461410369, 0.408774456663421)), row.names = c(NA, 
-18L), class = c("tbl_df", "tbl", "data.frame"))`enter code here`

CodePudding user response:

Is this you want?

final_df = my_df %>% 
           pivot_longer(cols = 3:4) %>% 
           group_by(Group, Position) %>% 
           summarise(mean = mean(value), across()) %>% 
           pivot_wider(names_from = name, values_from = value) %>% 
           mutate(R1 = R1 / mean,
                  R2 = R2 / mean)

CodePudding user response:

Here's a not so elegant solution that corrects my first answer. Hope this helps you in some way!

`final_df = my_df %>% 
           pivot_longer(cols = 3:4) %>% 
           group_by(Group, Position) %>% 
           summarise(mean = mean(value), across()) %>% 
           pivot_wider(names_from = name, values_from = value) %>% 
           mutate(R1 = case_when(Group == "A" ~ (R1 / .$mean[.$Group == "A" & .$Position == "position_1"]),
                                 Group == "B" ~ (R1 / .$mean[.$Group == "B" & .$Position == "position_1"])),
                  R2 = case_when(Group == "A" ~ (R2 / .$mean[.$Group == "A" & .$Position == "position_1"]),
                                 Group == "B" ~ (R2 / .$mean[.$Group == "B" & .$Position == "position_1"])))`

CodePudding user response:

Using the "original df" structure provided above and the data.table library, the solution as follows: to take a subset of the original data based on rows corresponding to the desired "position 1", then find the mean value of R1 and R2, grouping by Group; and then doing an inner join on the two tables using the Group as they key, then creating new output variables of the R1 and R2 columns divided by the means given in position 1 data set. I also created a new position column programmatically, in case this may also be useful.

library(data.table)

x<-structure(list(Group = c("A", "A", "A", "A", "A", "A", "A", "A", 
                         "A", "B", "B", "B", "B", "B", "B", "B", "B", "B"), Position = c("position_1", 
                                                                                         "position_2", "position_3", "position_4", "position_5", "position_6", 
                                                                                         "position_7", "position_8", "position_9", "position_1", "position_2", 
                                                                                         "position_3", "position_4", "position_5", "position_6", "position_7", 
                                                                                         "position_8", "position_9"), R1 = c(1.93682154090573, 1.91856596758703, 
                                                                                                                             1.49858450015508, 1.63084469573099, 1.37215468874851, 0.324435564944789, 
                                                                                                                             0.143752050868138, 0.106781932917991, 0.0680590581417462, 1.41897401502389, 
                                                                                                                             1.58660492722105, 1.13036771612536, 1.44662711984727, 1.45416934674475, 
                                                                                                                             0.919179924031544, 0.600901176433165, 0.275888860745681, 0.167286913827297
                                                                                         ), R2 = c(1.9343002335802, 1.93865578506416, 1.44482785753871, 
                                                                                                   1.72787733301708, 1.31512320353745, 0.355672021297666, 0.144683081118674, 
                                                                                                   0.0770487437127792, 0.061811741133288, 1.18778573693332, 1.47211785392794, 
                                                                                                   1.09160495952408, 1.27155041680802, 1.23843099839242, 0.866062777290239, 
                                                                                                   0.786331686244975, 0.553327119553047, 0.53278845132597)), row.names = c(NA, 
                                                                                                                                                                           -18L), class = c("tbl_df", "tbl", "data.frame"))
setDT(x)


x[,new_position:=1:.N,by=.(Group)]
position1_means <- x[new_position==1,.(Group,R1,R2)]
position1_means[,rowMeans:=mean(c(R1,R2)),by=.(Group)]
final_df <- x[position1_means,
              .(Group,
                new_position,
                R1,
                R2,
                R1_by_p1means=R1/rowMeans,
                R2_by_p1means=R2/rowMeans),
              on=.(Group)]

The resulting joined data.table "new_x" output values correspond to those in the given "final df"

  • Related