How to add summary statistics of a long data into wide data frame in R-CodePudding

I have 2 data frames like the one below:

Data Frame 1:

Data Frame 2

I am trying to get Median(IQR) for the phenotypes (Age, ACE2, Height and BMI) for Data Frame 1 by each rs_id's genotype (gt) and add it to Data Frame 2 by rs_id and Phenotpe. This is the resultant data frame I am expecting:

I am giving the minimal sample dput here.

Data Frame 1:

structure(list(BiobankID = c(131, 133, 134, 136, 140, 141, 143, 
144, 145, 131, 133, 134, 136, 140, 141, 143, 144, 145, 131, 133, 
134, 136, 140, 141, 143, 144, 145), Age = c(29, 58, 48, 44, 40, 
32, 43, 61, 64, 29, 58, 48, 44, 40, 32, 43, 61, 64, 29, 58, 48, 
44, 40, 32, 43, 61, 64), ACE2Conc.ngml = c(113.49, 52.91, 89.25, 
74.82, 28.5, 97.33, 97.53, 63.72, 56.86, 113.49, 52.91, 89.25, 
74.82, 28.5, 97.33, 97.53, 63.72, 56.86, 113.49, 52.91, 89.25, 
74.82, 28.5, 97.33, 97.53, 63.72, 56.86), Height = c(170, 160.2, 
161.3, 168, 148, 175.2, 167, 174, 163.1, 170, 160.2, 161.3, 168, 
148, 175.2, 167, 174, 163.1, 170, 160.2, 161.3, 168, 148, 175.2, 
167, 174, 163.1), BMI = c(31.8, 30.8, 33.6, 31.8, 20.1, 26.8, 
26, 27.4, 31.3, 31.8, 30.8, 33.6, 31.8, 20.1, 26.8, 26, 27.4, 
31.3, 31.8, 30.8, 33.6, 31.8, 20.1, 26.8, 26, 27.4, 31.3), rs_id = structure(c(9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 11L, 11L, 11L, 11L, 11L, 11L, 
11L, 11L, 11L, 35L, 35L, 35L, 35L, 35L, 35L, 35L, 35L, 35L), .Label = c("rs2278426", 
"rs1122326", "rs10033119", "rs4932373", "rs2071410", "rs41303171", 
"rs4646116", "rs328", "rs12130333", "rs1748197", "rs1748195", 
"rs79566395", "rs55843109", "rs4703642", "rs199347", "rs148110342", 
"rs16909225", "rs6814581", "rs3825041", "rs67904839", "rs11508026", 
"rs368669", "rs673548", "rs2266788", "rs9457", "rs7770437", "rs1046322", 
"rs1997623", "rs964184", "rs6589566", "rs12440118", "rs2160669", 
"rs4375003", "rs114060881", "rs4675468", "rs5754701", "rs12642481"
), class = "factor"), gt = c("1", "2", "2", "1", "1", "2", "1", 
"1", "1", "1", "2", "3", "1", "2", "2", "1", "2", "1", "3", "3", 
"3", "3", "3", "3", "3", "3", "3")), row.names = c(NA, -27L), class = c("tbl_df", 
"tbl", "data.frame"))

Data Frame 2

structure(list(Phenotype = c("ACE2Conc.ngml", "ACE2Conc.ngml", 
"ACE2Conc.ngml", "Age", "Age", "Age", "BMI", "BMI", "BMI", "Height", 
"Height", "Height"), CHR = c(1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 
2L, 1L, 1L, 2L), SNP = c("rs1748195", "rs12130333", "rs4675468", 
"rs1748195", "rs12130333", "rs4675468", "rs1748195", "rs12130333", 
"rs4675468", "rs1748195", "rs12130333", "rs4675468"), BP = c(63049593L, 
63191777L, 205400216L, 63049593L, 63191777L, 205400216L, 63049593L, 
63191777L, 205400216L, 63049593L, 63191777L, 205400216L), A1 = c("G", 
"T", "T", "G", "T", "T", "G", "T", "T", "G", "T", "T"), TEST = c("ADD", 
"ADD", "ADD", "ADD", "ADD", "ADD", "ADD", "ADD", "ADD", "ADD", 
"ADD", "ADD"), NMISS = c(1209L, 1242L, 1247L, 2018L, 2060L, 2066L, 
2016L, 2058L, 2064L, 2016L, 2058L, 2064L), BETA = c(-1.13, -2.189, 
0.5272, 0.08263, 0.7493, -0.419, -0.3865, 0.1747, -0.5264, 0.2913, 
1.334, -0.613), STAT = c(-1.175, -1.515, 0.2892, 0.2338, 1.371, 
-0.6622, -2.081, 0.6047, -1.575, 0.9933, 2.962, -1.169), P = c(0.2402, 
0.1301, 0.7724, 0.8152, 0.1706, 0.5079, 0.0376, 0.5454, 0.1154, 
0.3207, 0.003091, 0.2426)), class = "data.frame", row.names = c(NA, 
-12L))

Can someone please help?

CodePudding user response：

A tidyverse approach:

library(tidyverse)
df1 %>%
#for each id and gt, take the median of each variable:
  group_by(rs_id, gt) %>% 
  summarise(across(c(Age, ACE2Conc.ngml, Height, BMI), median)) %>%
  ungroup() %>% #stop doing operations for each id and gt
#change to a Phenotype | value format, instead of one column for each variable
  pivot_longer(c(Age, ACE2Conc.ngml, Height, BMI), names_to = "Phenotype") %>%
#change to a Phenotype | median_1 | median_2 | median_3
  pivot_wider(values_from = value, names_from = gt, names_prefix = "MEDIAN_") %>%
#merge and arrange the rows orders
  rename(SNP = rs_id) %>%
  merge(df2) %>%
  arrange(Phenotype, SNP)

Result:

          SNP     Phenotype MEDIAN_1 MEDIAN_2 MEDIAN_3 CHR        BP A1 TEST NMISS     BETA    STAT        P
1  rs12130333 ACE2Conc.ngml   69.270   89.250       NA   1  63191777  T  ADD  1242 -2.18900 -1.5150 0.130100
2   rs1748195 ACE2Conc.ngml   86.175   58.315    89.25   1  63049593  G  ADD  1209 -1.13000 -1.1750 0.240200
3   rs4675468 ACE2Conc.ngml       NA       NA    74.82   2 205400216  T  ADD  1247  0.52720  0.2892 0.772400
4  rs12130333           Age   43.500   48.000       NA   1  63191777  T  ADD  2060  0.74930  1.3710 0.170600
5   rs1748195           Age   43.500   49.000    48.00   1  63049593  G  ADD  2018  0.08263  0.2338 0.815200
6   rs4675468           Age       NA       NA    44.00   2 205400216  T  ADD  2066 -0.41900 -0.6622 0.507900
7  rs12130333           BMI   29.350   30.800       NA   1  63191777  T  ADD  2058  0.17470  0.6047 0.545400
8   rs1748195           BMI   31.550   27.100    33.60   1  63049593  G  ADD  2016 -0.38650 -2.0810 0.037600
9   rs4675468           BMI       NA       NA    30.80   2 205400216  T  ADD  2064 -0.52640 -1.5750 0.115400
10 rs12130333        Height  167.500  161.300       NA   1  63191777  T  ADD  2058  1.33400  2.9620 0.003091
11  rs1748195        Height  167.500  167.100   161.30   1  63049593  G  ADD  2016  0.29130  0.9933 0.320700
12  rs4675468        Height       NA       NA   167.00   2 205400216  T  ADD  2064 -0.61300 -1.1690 0.242600

CodePudding user response：

Not a real answwer, but may me a helpful hint: I've noticed that you work with data.frames which are also tibble so you might be a tidyverse-inhabitant. Did you consider using tidyr::gather resp. tidy::spread? and dplyr::summarize https://dplyr.tidyverse.org/reference/summarise.html e.g. useful functions ... IQR() ...