Not sure why I am getting this wrong
structure(list(peso1_v00 = structure(c(97.8, 102.9, 78.4, 80.4,
82.5, 92.6, 99, 66.8, 91.8, 70), label = "Peso: 1a determinación", format.spss = "F5.1"),
cintura1_v00 = structure(c(120, 123.2, 104, 106, 117.7, 115,
123, 93.5, 116.7, 109), label = "Cintura: 1a determinación", format.spss = "F5.1"),
tasis2_e_v00 = structure(c(139, 123, 138, 143, 160, 167,
139, 134, 161, 145), label = "TA: tensión arterial 2: sistólica", format.spss = "F4.0"),
tadias2_e_v00 = structure(c(74, 65, 75, 55, 75, 76, 65, 64,
62, 79), label = "TA: tensión arterial 2: diastólica", format.spss = "F4.0"),
p17_total_v00 = structure(c(7, 5, 9, 5, 8, 9, 4, 10, 10,
10), label = "Cuestionario de 17 puntos: Suma de puntuación de P17", format.spss = "F3.0"),
geaf_tot_v00 = structure(c(0, 1286.71, 5524.48, 419.58, 198.14,
839.16, 3188.81, 993.94, 2386.01, 3636.36), label = "AF: Gasto energético en actividad física total (MET•min/sem)", format.spss = "F8.2"),
glucosa_v00 = structure(c(116, 152, 113, 257, 108, 119, 108,
112, 141, 102), label = "Analítica: Glucosa en mg/dL", format.spss = "F4.0"),
albumi_v00 = structure(c(4.57, 4.71, 4.85, 4.43, 4.44, 4.59,
4.6, 4.56, 4.45, 4.59), label = "Analítica: Albúmina en g/dL", format.spss = "F6.2"),
coltot_v00 = structure(c(261, 197, 235, 168, 217, 248, 155,
254, 265, 326), label = "Analítica: Colesterol total en mg/dL", format.spss = "F4.0"),
hdl_v00 = structure(c(71, 43, 38, 36, 57, 49, 54, 60, 48,
60), label = "Analítica: Colesterol HDL en mg/dL", format.spss = "F4.0"),
ldl_calc_v00 = structure(c(147, 135, 161, NA, 145, 155, 73,
157, 172, 222), label = "Analítica: LDL calculado en mg/dL si trigli<=300", format.spss = "F4.0"),
trigli_v00 = structure(c(217, 93, 179, 315, 73, 219, 138,
186, 223, 220), label = "Analítica: Triglicéridos en mg/dL", format.spss = "F5.0"),
hba1c_v00 = structure(c(6.29, NA, 5.63, 12.17, NA, 6.11,
5.9, 5.68, NA, 6.16), label = "Analítica: Hemoglobina glicosilada (HbA1c %)", format.spss = "F5.2"),
i_hucpeptide_v00 = structure(c(854.34, NA, 1485.59, 4241.95,
NA, 847.89, 1524.39, 1265.48, NA, 290.22), label = "Hu C-peptide (72) IMIM S'han substituit en les següents var els codis de inf i sup a limit de detecció per el limit inf i sup de detecció", format.spss = "F9.2", display_width = 13L),
i_hughrelin_v00 = structure(c(681.94, NA, 480.11, 1587.73,
NA, 453, 263.93, 392.98, NA, 1327.91), label = "Hu Ghrelin (26) IMIM", format.spss = "F7.2", display_width = 10L),
i_hugip_v00 = structure(c(2.67, NA, 2.67, 2.67, NA, 2.67,
2.67, 2.67, NA, 2.67), label = "Hu GIP (14) IMIM", format.spss = "F9.2", display_width = 9L),
i_huglp1_v00 = structure(c(177.37, NA, 202.62, 519.02, NA,
200.13, 163.82, 20.29, NA, 14.14), label = "Hu GLP-1 (27) IMIM", format.spss = "F9.2", display_width = 9L),
i_huglucagon_v00 = structure(c(387.58, NA, 591.73, 855.29,
NA, 726.99, 430.35, 389.59, NA, 336.04), label = "Hu Glucagon (15) IMIM", format.spss = "F9.2", display_width = 11L),
i_huinsulin_v00 = structure(c(278.72, NA, 538.29, 1693.25,
NA, 299.75, 608.35, 397.7, NA, 129.17), label = "Hu Insulin (12) IMIM", format.spss = "F7.2", display_width = 10L),
i_huleptin_v00 = structure(c(2518.28, NA, 12175.88, 12369.5,
NA, 8409.76, 5998.71, 9298.52, NA, 5919.57), label = "Hu Leptin (78) IMIM", format.spss = "F9.2", display_width = 9L),
i_hupai1_v00 = structure(c(3084.08, NA, 2650.85, 3202.18,
NA, 3085.25, 3410.73, 3109.79, NA, 1375.07), label = "Hu PAI-1 (61) IMIM", format.spss = "F7.2"),
i_huresistin_v00 = structure(c(4758.94, NA, 3594.11, 13564.63,
NA, 3221.72, 2864.01, 3630.63, NA, 2827.01), label = "Hu Resistin (65) IMIM", format.spss = "F8.2", display_width = 9L),
i_huvisfatin_v00 = structure(c(8.64, NA, 2081.59, 2363.58,
NA, 2989.72, 653.96, 1129.24, NA, 631.11), label = "Hu Visfatin (22) IMIM", format.spss = "F9.2", display_width = 6L),
col_rema_v00 = structure(c(43, 19, 36, NA, 15, 44, 28, 37,
45, 44), format.spss = "F8.2", display_width = 14L), homa_v00 = structure(c(1436.96,
NA, 2703.41, 19340.68, NA, 1585.34, 2920.08, 1979.66, NA,
585.57), format.spss = "F8.2", display_width = 10L), ln_trigli_v00 = structure(c(5.38,
4.53, 5.19, 5.75, 4.29, 5.39, 4.93, 5.23, 5.41, 5.39), format.spss = "F8.2", display_width = 15L),
ln_homa_v00 = structure(c(7.27, NA, 7.9, 9.87, NA, 7.37,
7.98, 7.59, NA, 6.37), format.spss = "F8.2", display_width = 13L),
ln_hba1c_v00 = structure(c(1.84, NA, 1.73, 2.5, NA, 1.81,
1.77, 1.74, NA, 1.82), format.spss = "F8.2", display_width = 14L),
ln_geaf_tot_v00 = structure(c(NA, 7.16, 8.62, 6.04, 5.29,
6.73, 8.07, 6.9, 7.78, 8.2), format.spss = "F8.2", display_width = 17L)), row.names = c(NA,
-10L), class = "data.frame")
What I want to carry out is detection of points outside a certain range
# Asker's attempt, reproduced as posted because it is the subject of the
# question. It does not produce the intended result; see the answers below.
# Names of the columns to screen; the loop only uses how many there are.
colvars <- c(
  "peso1_v00", "cintura1_v00", "tasis2_e_v00", "tadias2_e_v00",
  "p17_total_v00", "geaf_tot_v00", "glucosa_v00", "albumi_v00",
  "coltot_v00", "hdl_v00", "ldl_calc_v00", "trigli_v00", "hba1c_v00",
  "i_hucpeptide_v00", "i_hughrelin_v00", "i_hugip_v00", "i_huglp1_v00",
  "i_huglucagon_v00", "i_huinsulin_v00", "i_huleptin_v00", "i_hupai1_v00",
  "i_huresistin_v00", "i_huvisfatin_v00", "col_rema_v00", "homa_v00",
  "ln_trigli_v00", "ln_homa_v00", "ln_hba1c_v00", "ln_geaf_tot_v00"
)
i=1
for (i in 1:length(colvars)) {
# NOTE(review): if outliers_v00 is a plain data.frame, `outliers_v00[ ,i]` is
# a bare vector, so sapply() calls quantile()/IQR() once per single value
# rather than once for the whole column.
q<-sapply(outliers_v00[ ,i], quantile, probs = c(0.25, 0.75), na.rm =T)
iqr<-sapply(outliers_v00[ ,i], IQR, na.rm =T)
# NOTE(review): `i` is a length-one loop index, so i[2] is NA.
up <- q[ , i[2]] + 1.5*iqr # Upper Range
low<- q[ ,i[1]]-1.5*iqr # Lower Range
# NOTE(review): this line reads `outliers`, the next one `outliers_v00`.
remained<- subset(outliers, outliers[ ,i] > (q[1] - 1.5*iqr) & outliers[ ,i] < (q[2] + 1.5*iqr))
# NOTE(review): a value cannot be below the lower limit AND above the upper
# limit at the same time, so this condition never selects a row.
eliminated<-subset(outliers_v00, outliers_v00[,i] < (q[1] - 1.5*iqr) & outliers_v00[,i] > (q[2] + 1.5*iqr))
}
Even simplifying the loop down to
# Reduced version of the asker's loop: only the quartiles and the IQR are
# computed for column i of outliers_v00.
i=1
for (i in 1:length(colvars)) {
# NOTE(review): assuming outliers_v00 is a plain data.frame, `[ ,i]` yields a
# bare vector and sapply() then applies quantile() to each value separately,
# which would explain identical 25% and 75% results.
q<-sapply(outliers_v00[ ,i], quantile, probs = c(0.25, 0.75), na.rm =T)
iqr<-sapply(outliers_v00[ ,i], IQR, na.rm =T)
# q and iqr are overwritten on every pass; only the last column's values
# remain after the loop.
}
I don't get both quantiles; in fact, I get the same value (which does not correspond to anything) for both Q25 and Q75.
Without loop I get the vector with values
# Same call as inside the loop, run once with whatever value `i` currently holds.
q<-sapply(outliers_v00[ ,i], quantile, probs = c(0.25, 0.75), na.rm =T)
But afterwards, for the expressions remained and eliminated, I could not make the iteration work. I know there are alternatives, but I want to learn how to refer to columns with [, i]
CodePudding user response:
I would perhaps take this approach with sapply(): taking the outliers_v00 data, it will work through each variable, applying the function specified. Here it is a custom function returning the quantiles and IQR as a vector. The output from sapply() is then transposed with t() to put it a more intuitive way around, and returned as a data.frame.
# Build a per-variable summary table: sapply() produces one column per
# variable, t() flips it so each variable becomes a row, and data.frame()
# wraps the result. Each row holds the 25% quantile, the 75% quantile and the
# IQR, all computed with missing values removed.
data.frame(
  t(
    sapply(outliers_v00, function(values) {
      quartiles <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE)
      spread <- IQR(values, na.rm = TRUE)
      c(quartiles, spread)
    })
  )
)
You can then get lists of the values that are inside and outside of the range with this - note that it sends NA values to the excluded list, and a list is appropriate as vector lengths differ (some variables keep all their values, others do not, so it is not good for a data.frame)
# Per-variable summary: one row per column of outliers_v00, holding the 25%
# quantile, the 75% quantile and the IQR in columns 1 to 3 (used by position
# below). Missing values are ignored when computing them.
out <- data.frame(t(sapply(outliers_v00, function(j) {
  c(quantile(j, probs = c(0.25, 0.75), na.rm = TRUE), IQR(j, na.rm = TRUE))
})))

# Outlier limits: 1.5 * IQR below the lower quartile and above the upper one.
out$low <- out[, 1] - (out[, 3] * 1.5)
out$up <- out[, 2] + (out[, 3] * 1.5)

# Values inside the limits (bounds included), one list element per column.
# lapply() is used so the result is always a list, even if every column
# happens to keep the same number of values. NA values are not kept here.
remained <- lapply(seq_along(outliers_v00), function(j) {
  x <- outliers_v00[[j]]
  in_range <- x >= out$low[j] & x <= out$up[j]
  x[!is.na(x) & in_range]
})

# Everything that was not kept: values outside the limits, plus NA entries
# (which therefore show up as NA in this list).
eliminated <- lapply(seq_along(outliers_v00), function(j) {
  x <- outliers_v00[[j]]
  in_range <- x >= out$low[j] & x <= out$up[j]
  x[is.na(x) | !in_range]
})
Side note on the use of T and F - it is better practice to use TRUE and FALSE, because T and F are ordinary variables whose meaning can be overwritten, and that could cause havoc, e.g.
# Demonstration: `T` is an ordinary variable, so it can be reassigned.
# After this assignment the condition below is FALSE and nothing is printed.
T <- FALSE
if (T) {
  print("hello")
}
CodePudding user response:
It's your usage of sapply which is incorrect, also try another subsetting. Also, you need to assign your loop result to a new list element.
The idea is - either use sapply or lapply, or a for loop. You won't need both. I'd directly loop over the data frame with lapply.
## Define the result list first, with one slot per column, so it is not
## grown inside the loop.
my_quantiles <- vector("list", length(outliers_v00))
## Loop directly over the columns of the data frame - they are the same as
## the colvars vector. seq_along() is also safe for a data frame with no
## columns, where 1:length() would iterate over c(1, 0).
for (i in seq_along(outliers_v00)) {
  ## No sapply needed: [[i]] extracts column i as a plain vector.
  x <- outliers_v00[[i]]
  ## TRUE is spelled out because `T` is an ordinary variable that can be
  ## reassigned (e.g. by the `T <- FALSE` demonstration earlier on this page).
  q <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
  iqr <- diff(unname(q))
  up <- q[2] + 1.5 * iqr # Upper Range
  low <- q[1] - 1.5 * iqr # Lower Range
  ## Bounds are inclusive for the kept values. Missing values are neither
  ## kept nor flagged; without the is.na() guard they would appear as NA in
  ## both vectors.
  inside <- !is.na(x) & x >= low & x <= up
  ## "OR": a value is an outlier if it is below the lower limit or above the
  ## upper limit.
  outside <- !is.na(x) & (x < low | x > up)
  ## Store everything for this column at the matching list position.
  my_quantiles[[i]] <- list(
    quantiles = q,
    IQR = iqr,
    remained = x[inside],
    eliminated = x[outside]
  )
}
my_quantiles
#> [[1]]
#> [[1]]$quantiles
#> 25% 75%
#> 78.9 96.5
#> [[1]]$IQR
#> [1] 17.6
#> [[1]]$remained
#> [1] 97.8 102.9 78.4 80.4 82.5 92.6 99.0 66.8 91.8 70.0
#> [[1]]$eliminated
#> numeric(0)
#> [[2]]
#> [[2]]$quantiles
#> 25% 75%
#> 106.750 119.425
...
Better to directly loop over your data frame. Advantage is: Much less code, no subsetting needed, therefore much less likelihood of loop-confusion, and you will have the column names "for free".
## Summarise every column in one call: lapply() passes each column to the
## function as a vector and returns a list named after the columns. Each
## element holds the 25% quantile, the 75% quantile and their difference.
lapply(outliers_v00, function(x) {
  ## TRUE is spelled out because `T` is an ordinary variable that can be
  ## reassigned (e.g. by the `T <- FALSE` demonstration earlier on this page).
  q <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
  iqr <- diff(unname(q))
  c(q, IQR = iqr)
  ## etc
})
#> $peso1_v00
#> 25% 75% IQR
#> 78.9 96.5 17.6
#> $cintura1_v00
#> 25% 75% IQR
#> 106.750 119.425 12.675
...