Loop over data frame to get quantiles and identify outliers-CodePudding

Not sure why I am getting this wrong

structure(list(peso1_v00 = structure(c(97.8, 102.9, 78.4, 80.4, 
82.5, 92.6, 99, 66.8, 91.8, 70), label = "Peso: 1a determinación", format.spss = "F5.1"), 
    cintura1_v00 = structure(c(120, 123.2, 104, 106, 117.7, 115, 
    123, 93.5, 116.7, 109), label = "Cintura: 1a determinación", format.spss = "F5.1"), 
    tasis2_e_v00 = structure(c(139, 123, 138, 143, 160, 167, 
    139, 134, 161, 145), label = "TA: tensión arterial 2: sistólica", format.spss = "F4.0"), 
    tadias2_e_v00 = structure(c(74, 65, 75, 55, 75, 76, 65, 64, 
    62, 79), label = "TA: tensión arterial 2: diastólica", format.spss = "F4.0"), 
    p17_total_v00 = structure(c(7, 5, 9, 5, 8, 9, 4, 10, 10, 
    10), label = "Cuestionario de 17 puntos: Suma de puntuación de P17", format.spss = "F3.0"), 
    geaf_tot_v00 = structure(c(0, 1286.71, 5524.48, 419.58, 198.14, 
    839.16, 3188.81, 993.94, 2386.01, 3636.36), label = "AF: Gasto energético en actividad física total (MET•min/sem)", format.spss = "F8.2"), 
    glucosa_v00 = structure(c(116, 152, 113, 257, 108, 119, 108, 
    112, 141, 102), label = "Analítica: Glucosa en mg/dL", format.spss = "F4.0"), 
    albumi_v00 = structure(c(4.57, 4.71, 4.85, 4.43, 4.44, 4.59, 
    4.6, 4.56, 4.45, 4.59), label = "Analítica: Albúmina en g/dL", format.spss = "F6.2"), 
    coltot_v00 = structure(c(261, 197, 235, 168, 217, 248, 155, 
    254, 265, 326), label = "Analítica: Colesterol total en mg/dL", format.spss = "F4.0"), 
    hdl_v00 = structure(c(71, 43, 38, 36, 57, 49, 54, 60, 48, 
    60), label = "Analítica: Colesterol HDL en mg/dL", format.spss = "F4.0"), 
    ldl_calc_v00 = structure(c(147, 135, 161, NA, 145, 155, 73, 
    157, 172, 222), label = "Analítica: LDL calculado en mg/dL si trigli<=300", format.spss = "F4.0"), 
    trigli_v00 = structure(c(217, 93, 179, 315, 73, 219, 138, 
    186, 223, 220), label = "Analítica: Triglicéridos en mg/dL", format.spss = "F5.0"), 
    hba1c_v00 = structure(c(6.29, NA, 5.63, 12.17, NA, 6.11, 
    5.9, 5.68, NA, 6.16), label = "Analítica: Hemoglobina glicosilada (HbA1c %)", format.spss = "F5.2"), 
    i_hucpeptide_v00 = structure(c(854.34, NA, 1485.59, 4241.95, 
    NA, 847.89, 1524.39, 1265.48, NA, 290.22), label = "Hu C-peptide (72) IMIM S'han substituit en les següents var els codis de inf i sup a limit de detecció per el limit inf i sup de detecció", format.spss = "F9.2", display_width = 13L), 
    i_hughrelin_v00 = structure(c(681.94, NA, 480.11, 1587.73, 
    NA, 453, 263.93, 392.98, NA, 1327.91), label = "Hu Ghrelin (26) IMIM", format.spss = "F7.2", display_width = 10L), 
    i_hugip_v00 = structure(c(2.67, NA, 2.67, 2.67, NA, 2.67, 
    2.67, 2.67, NA, 2.67), label = "Hu GIP (14) IMIM", format.spss = "F9.2", display_width = 9L), 
    i_huglp1_v00 = structure(c(177.37, NA, 202.62, 519.02, NA, 
    200.13, 163.82, 20.29, NA, 14.14), label = "Hu GLP-1 (27) IMIM", format.spss = "F9.2", display_width = 9L), 
    i_huglucagon_v00 = structure(c(387.58, NA, 591.73, 855.29, 
    NA, 726.99, 430.35, 389.59, NA, 336.04), label = "Hu Glucagon (15) IMIM", format.spss = "F9.2", display_width = 11L), 
    i_huinsulin_v00 = structure(c(278.72, NA, 538.29, 1693.25, 
    NA, 299.75, 608.35, 397.7, NA, 129.17), label = "Hu Insulin (12) IMIM", format.spss = "F7.2", display_width = 10L), 
    i_huleptin_v00 = structure(c(2518.28, NA, 12175.88, 12369.5, 
    NA, 8409.76, 5998.71, 9298.52, NA, 5919.57), label = "Hu Leptin (78) IMIM", format.spss = "F9.2", display_width = 9L), 
    i_hupai1_v00 = structure(c(3084.08, NA, 2650.85, 3202.18, 
    NA, 3085.25, 3410.73, 3109.79, NA, 1375.07), label = "Hu PAI-1 (61) IMIM", format.spss = "F7.2"), 
    i_huresistin_v00 = structure(c(4758.94, NA, 3594.11, 13564.63, 
    NA, 3221.72, 2864.01, 3630.63, NA, 2827.01), label = "Hu Resistin (65) IMIM", format.spss = "F8.2", display_width = 9L), 
    i_huvisfatin_v00 = structure(c(8.64, NA, 2081.59, 2363.58, 
    NA, 2989.72, 653.96, 1129.24, NA, 631.11), label = "Hu Visfatin (22) IMIM", format.spss = "F9.2", display_width = 6L), 
    col_rema_v00 = structure(c(43, 19, 36, NA, 15, 44, 28, 37, 
    45, 44), format.spss = "F8.2", display_width = 14L), homa_v00 = structure(c(1436.96, 
    NA, 2703.41, 19340.68, NA, 1585.34, 2920.08, 1979.66, NA, 
    585.57), format.spss = "F8.2", display_width = 10L), ln_trigli_v00 = structure(c(5.38, 
    4.53, 5.19, 5.75, 4.29, 5.39, 4.93, 5.23, 5.41, 5.39), format.spss = "F8.2", display_width = 15L), 
    ln_homa_v00 = structure(c(7.27, NA, 7.9, 9.87, NA, 7.37, 
    7.98, 7.59, NA, 6.37), format.spss = "F8.2", display_width = 13L), 
    ln_hba1c_v00 = structure(c(1.84, NA, 1.73, 2.5, NA, 1.81, 
    1.77, 1.74, NA, 1.82), format.spss = "F8.2", display_width = 14L), 
    ln_geaf_tot_v00 = structure(c(NA, 7.16, 8.62, 6.04, 5.29, 
    6.73, 8.07, 6.9, 7.78, 8.2), format.spss = "F8.2", display_width = 17L)), row.names = c(NA, 
-10L), class = "data.frame")

What I want to carry out is detection of points outside a certain range

# Names of the baseline (_v00) variables to screen for outliers, listed in
# the same order as the columns of the data frame.
colvars <- c(
  "peso1_v00", "cintura1_v00", "tasis2_e_v00", "tadias2_e_v00",
  "p17_total_v00", "geaf_tot_v00", "glucosa_v00", "albumi_v00",
  "coltot_v00", "hdl_v00", "ldl_calc_v00", "trigli_v00", "hba1c_v00",
  "i_hucpeptide_v00", "i_hughrelin_v00", "i_hugip_v00", "i_huglp1_v00",
  "i_huglucagon_v00", "i_huinsulin_v00", "i_huleptin_v00", "i_hupai1_v00",
  "i_huresistin_v00", "i_huvisfatin_v00", "col_rema_v00", "homa_v00",
  "ln_trigli_v00", "ln_homa_v00", "ln_hba1c_v00", "ln_geaf_tot_v00"
)

# Question code as posted (it does not work as intended - see the answers
# below). Goal: for each column, separate the values inside
# [Q25 - 1.5 * IQR, Q75 + 1.5 * IQR] from those outside it.
i=1
for (i in 1:length(colvars)) {
  
  q<-sapply(outliers_v00[ ,i], quantile, probs = c(0.25, 0.75), na.rm =T)
  iqr<-sapply(outliers_v00[ ,i], IQR, na.rm =T)
  up <-  q[ , i[2]] + 1.5*iqr # Upper Range  
  low<- q[ ,i[1]]-1.5*iqr # Lower Range
  remained<- subset(outliers, outliers[ ,i] > (q[1] - 1.5*iqr) & outliers[ ,i] < (q[2] + 1.5*iqr))
  eliminated<-subset(outliers_v00, outliers_v00[,i] < (q[1] - 1.5*iqr) & outliers_v00[,i] > (q[2] + 1.5*iqr))

}

Even simplifying the loop to

# Reduced version of the loop: only the quartiles and the IQR are computed.
# q and iqr are reassigned on every pass, so after the loop they hold only
# the values from the last iteration.
i=1
for (i in 1:length(colvars)) {
  
  # sapply() calls quantile() once per element of outliers_v00[ ,i]. For a
  # plain data.frame, [ ,i] returns the column as a vector, so each call
  # receives a single value.
  q<-sapply(outliers_v00[ ,i], quantile, probs = c(0.25, 0.75), na.rm =T)
  iqr<-sapply(outliers_v00[ ,i], IQR, na.rm =T)
}

I don't get both quantiles; in fact, I get the same value (which does not correspond to anything) for both Q25 and Q75.

Without loop I get the vector with values

  # The same call as in the loop body, run once at top level with whatever
  # value i currently holds.
  q<-sapply(outliers_v00[ ,i], quantile, probs = c(0.25, 0.75), na.rm =T)

But afterwards I could not make the iteration work for the expressions remained and eliminated. I know there are alternatives, but I want to learn how to refer to columns with [, i].

CodePudding user response：

I would perhaps take this approach: sapply() takes the outliers_v00 data and works through each variable, applying the function specified. Here it is a custom function returning the quantiles and IQR as a vector. The output from sapply() is then transposed with t() to give a more intuitive layout (one row per variable), and returned as a data.frame.

# Summary table with one row per variable: the 25% and 75% quantiles
# followed by the interquartile range, all computed with NAs removed.
data.frame(
  t(
    sapply(outliers_v00, function(column) {
      quartiles <- quantile(column, probs = c(0.25, 0.75), na.rm = TRUE)
      spread <- IQR(column, na.rm = TRUE)
      c(quartiles, spread)
    })
  )
)

You can then get lists of the values that are inside and outside of the range with this. Note that NA values end up in the eliminated list rather than the remained list, and that a list is appropriate because the vector lengths differ (some variables keep all their values, others do not, so the result is not suitable for a data.frame).

# Per-variable quartiles and IQR: one row per column of outliers_v00, with
# columns 1-3 holding Q25, Q75 and the IQR respectively.
out <- data.frame(t(sapply(outliers_v00, function(j) {
  c(quantile(j, probs = c(0.25, 0.75), na.rm = TRUE), IQR(j, na.rm = TRUE))
})))

# Fences at 1.5 * IQR below Q25 and above Q75.
out$low <- out[, 1] - (out[, 3] * 1.5)
out$up  <- out[, 2] + (out[, 3] * 1.5)

# Values inside [low, up] for each column, with NAs dropped explicitly.
# lapply() always returns a list, which is what is wanted here because the
# number of retained values differs between columns.
remained <- lapply(seq_along(outliers_v00), function(j) {
  x <- outliers_v00[[j]]
  x[!is.na(x) & x >= out$low[j] & x <= out$up[j]]
})

# Values outside [low, up] for each column. Comparing NA yields NA, so
# missing values are carried into this list as NA entries.
eliminated <- lapply(seq_along(outliers_v00), function(j) {
  x <- outliers_v00[[j]]
  x[!(x >= out$low[j] & x <= out$up[j])]
})

Side note on the use of T and F - it is better practice to use TRUE and FALSE - T and F can have their meaning overwritten and that could cause havoc, e.g.

# T is an ordinary variable that merely defaults to TRUE; once it is
# rebound, code relying on it silently changes meaning.
T <- FALSE
if (T) {
  print("hello")
}

CodePudding user response：

The problem is your usage of sapply; you should also subset the columns differently. In addition, you need to assign the result of each loop iteration to a new list element.

The idea is - either use sapply or lapply, or a for loop. You won't need both. I'd directly loop over the data frame with lapply.

## Define the result list first, with one slot per column of the data frame.
my_quantiles <- vector("list", length(outliers_v00))

## You can and should loop directly over the columns of your data frame -
## they are the same as your colvars vector. This is much safer.
for (i in seq_along(outliers_v00)) {
  ## No sapply needed: [[i]] extracts column i as a plain vector.
  x <- outliers_v00[[i]]
  ## Spell out TRUE: T is an ordinary variable and can be reassigned.
  q <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
  iqr <- diff(unname(q))

  up <- q[2] + 1.5 * iqr  # Upper Range
  low <- q[1] - 1.5 * iqr # Lower Range

  ## Slightly simplified subsetting, with "equal to" included in the bounds.
  ## NAs are excluded here so that they are not reported as within range.
  remained <- x[!is.na(x) & x >= low & x <= up]
  ## An "OR" is needed here: no value is both below low and above up.
  ## Comparing NA yields NA, so missing values appear in this vector as NA.
  eliminated <- x[x < low | x > up]

  ## Store the results for column i at the matching list position.
  my_quantiles[[i]] <- list(
    quantiles = q, IQR = iqr, remained = remained, eliminated = eliminated
  )
}

my_quantiles

#> [[1]]
#> [[1]]$quantiles
#>  25%  75% 
#> 78.9 96.5 

#> [[1]]$IQR
#> [1] 17.6

#> [[1]]$remained
#>  [1]  97.8 102.9  78.4  80.4  82.5  92.6  99.0  66.8  91.8  70.0

#> [[1]]$eliminated
#> numeric(0)


#> [[2]]
#> [[2]]$quantiles
#>     25%     75% 
#> 106.750 119.425 
...

Better to directly loop over your data frame. Advantage is: Much less code, no subsetting needed, therefore much less likelihood of loop-confusion, and you will have the column names "for free".

## Apply the same computation to every column; the result is a list named
## after the columns of the data frame.
lapply(outliers_v00, function(x) {
  ## x is one column of the data frame, passed in as a plain vector.
  ## Spell out TRUE: T is an ordinary variable and can be reassigned.
  q <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
  iqr <- diff(unname(q))
  ## etc. - compute anything else needed per column before this final
  ## expression, which is the value returned for the column.
  c(q, IQR = iqr)
})

#> $peso1_v00
#>     25%     75%     IQR  
#>    78.9    96.5    17.6 

#> $cintura1_v00
#>     25%     75%     IQR 
#> 106.750 119.425  12.675 
...