Home > Software engineering >  How to delete outliers from a nested dataset when they are truly extreme via an iterative method or
How to delete outliers from a nested dataset when they are truly extreme via an iterative method or

Time:10-21

I'm trying to remove outliers from this nested dataset:

df_join
# A tibble: 12 x 2
# Groups:   signals [12]
   signals     data             
   <chr>       <list>           
 1 P3FCz       <tibble [75 x 5]>
 2 P3Cz        <tibble [75 x 5]>
 3 P3Pz        <tibble [75 x 5]>
 4 LPPearlyFCz <tibble [75 x 5]>
 5 LPPearlyCz  <tibble [75 x 5]>
 6 LPPearlyPz  <tibble [75 x 5]>
 7 LPP1FCz     <tibble [75 x 5]>
 8 LPP1Cz      <tibble [75 x 5]>
 9 LPP1Pz      <tibble [75 x 5]>
10 LPP2FCz     <tibble [75 x 5]>
11 LPP2Cz      <tibble [75 x 5]>
12 LPP2Pz      <tibble [75 x 5]>

The first six rows of it (from `dput(head(df_join))`) look like this:

> dput(head(df_join))
structure(list(signals = c("P3FCz", "P3Cz", "P3Pz", "LPPearlyFCz", 
"LPPearlyCz", "LPPearlyPz"), data = list(structure(list(ID = structure(c(1L, 
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 
6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 
11L, 12L, 12L, 12L, 13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 
16L, 16L, 16L, 17L, 17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 
20L, 20L, 21L, 21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 
24L, 25L, 25L, 25L), .Label = c("01", "04", "06", "07", "08", 
"09", "10", "11", "12", "13", "15", "16", "17", "18", "19", "21", 
"22", "23", "25", "27", "28", "30", "44", "46", "49"), class = "factor"), 
    GR = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), SES = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), 
    COND = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L), .Label = c("NEG-CTR", "NEG-NOC", "NEU-NOC"
    ), class = "factor"), value = c(-11.6312151716924, -11.1438413285935, 
    -3.99591470944713, -0.314155675382471, 0.238885648959708, 
    5.03749946898385, -0.213621915029167, -2.96032491743069, 
    -1.97168681693488, -2.83109425298642, 1.09291198163802, -6.692991645215, 
    4.23849942428043, 2.9898889629932, 3.5510699900835, 9.57481668808606, 
    5.4167795618285, 1.7067607715475, -6.13036076093477, -2.82955734597919, 
    -2.50672211111696, 0.528517585832501, 8.16418133488309, 1.88777321897925, 
    -7.73588468896919, -9.83058052401056, -6.97442700196932, 
    1.27327945355082, 2.11962397764132, 0.524299677616254, -1.83310726842883, 
    0.658810483381172, -0.261373488428192, 4.37524298634374, 
    0.625555654900511, 3.19617639836154, 0.0405517582137798, 
    -3.29357103412113, -0.381435057304614, -5.73445509910268, 
    -6.1129152355645, -2.45744234877604, 2.95352732001065, 0.527721249096473, 
    1.91803490989119, -3.46703346467546, -2.40438419043702, -5.35374408162217, 
    -7.27028665849262, -7.1532211375959, -5.39955520296854, 2.65765002364624, 
    0.372495441513391, 6.24433066412776, 1.85698518142405, -0.564454675803529, 
    -0.068523080368053, -7.04782633579147, -4.52263283590558, 
    -6.62134671432544, 4.56661945182626, 3.05859761335498, 2.02997952225347, 
    -6.10523962206958, -0.521871236969702, -3.97851995684846, 
    -2.61258020387919, -4.13974828699279, -3.9210032516844, -4.63162466544638, 
    -4.36762718685405, -6.71005969834916, -4.22719611676328, 
    -0.229916506217565, -5.69725200870146)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -75L)), structure(list(
    ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 
    4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 
    9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L, 
    13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 
    17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 
    21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 
    25L, 25L), .Label = c("01", "04", "06", "07", "08", "09", 
    "10", "11", "12", "13", "15", "16", "17", "18", "19", "21", 
    "22", "23", "25", "27", "28", "30", "44", "46", "49"), class = "factor"), 
    GR = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), SES = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), 
    COND = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L), .Label = c("NEG-CTR", "NEG-NOC", "NEU-NOC"
    ), class = "factor"), value = c(-5.16524399006139, -5.53112490175437, 
    0.621502123415388, 2.23100741241039, 3.96990710862955, 7.75899775608441, 
    -1.30019374375434, -3.59899040898949, -1.92340529575071, 
    2.19344184533265, 5.87900720863083, -5.92378937757888, 2.44958531767688, 
    3.10043497883256, 1.65779442628225, 13.7118233181713, 6.86178446511352, 
    5.31481098188172, -4.13240668697805, 0.162182285588285, 0.142083484505352, 
    5.42592103255673, 14.5496375672716, 4.52018125654081, -2.40677805475299, 
    -5.3832670295207, -1.55736964635117, 3.48359241788107, 4.23167123533126, 
    2.00051785325202, 1.48755216347718, 2.37269462739372, 1.30346907198835, 
    3.89476490634811, 1.87516303240986, 4.36353100770575, 1.9413417416824, 
    -2.22114447555529, -0.015852062711641, -2.76146409940467, 
    -3.51627712447581, 1.01799377568815, 1.74783962328435, 1.1303870721987, 
    2.16398550183836, -3.31557794753334, -1.83920975041768, -6.06703163736936, 
    -8.1566939611461, -9.23030396302541, -4.35545141573936, 0.906302081219897, 
    0.45401759063429, 3.80236232314171, 4.0336657306528, 2.0185967445137, 
    0.835589319243251, -4.6805488231028, -1.20746167339041, -5.50475999427345, 
    4.96594373869991, 4.1349308440931, 3.00187233307059, -5.61465293602653, 
    0.544596077279702, -5.20450410570445, -0.0325220589039272, 
    -2.28038421035601, -2.01375702882255, -1.6547144697087, -0.619979893871085, 
    -4.48258340054462, -1.42281778522059, 2.62315679073783, -4.13736508533355
    )), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-75L)), structure(list(ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 
3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 
8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 
13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 
17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 21L, 
21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 25L, 25L
), .Label = c("01", "04", "06", "07", "08", "09", "10", "11", 
"12", "13", "15", "16", "17", "18", "19", "21", "22", "23", "25", 
"27", "28", "30", "44", "46", "49"), class = "factor"), GR = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), 
    SES = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), COND = structure(c(1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("NEG-CTR", 
    "NEG-NOC", "NEU-NOC"), class = "factor"), value = c(11.8802266972569, 
    12.1053426662461, 12.955441582096, 15.0981004360619, 15.4046229884164, 
    16.671036999147, 3.13771453335467, -0.0892565159000666, 2.15365554736525, 
    13.6778924406572, 14.3862738306396, 6.86762877785576, 7.47946451329025, 
    8.93405130318593, 8.45962311067909, 23.4166601996042, 15.1868092142896, 
    9.97183712753913, 6.267521071803, 10.142198458411, 10.6320358418368, 
    12.9998037913548, 20.7052065690674, 11.8852179570666, 15.7899796085713, 
    7.50729833890206, 14.3076172484818, 9.93797956768228, 10.7693238464384, 
    5.04681800218272, 5.16656503460515, 7.87875085817396, 2.29899409536951, 
    10.0135486953849, 5.48278706243332, 7.81908431468528, 8.64382513728869, 
    3.35777109534179, 3.47474629234488, 4.35678644331281, 3.47085321062162, 
    6.56231512354717, 4.93825547529124, 7.33985613752315, 6.81966900599588, 
    6.54487921689425, 7.25872117706077, 1.10301223694429, -0.856423579793706, 
    -0.887835692028378, -0.931653372049331, 5.6617683754256, 
    2.29939831067085, 5.1554825066748, 6.59026080217083, 3.0741733363644, 
    1.80359068950898, 1.63892755704177, 3.857933716935, 0.769316188513939, 
    10.7031907391191, 9.53278894637555, 8.01071628743378, 6.04891324234645, 
    11.1964453850602, 3.46633322373091, 14.4393884282958, 11.2339563353478, 
    7.74933708914689, 7.1182095475238, 7.39260082121406, 0.627435381320771, 
    9.15473202689768, 13.6559037433263, 7.14786907480758)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -75L)), structure(list(
    ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 
    4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 
    9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L, 
    13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 
    17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 
    21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 
    25L, 25L), .Label = c("01", "04", "06", "07", "08", "09", 
    "10", "11", "12", "13", "15", "16", "17", "18", "19", "21", 
    "22", "23", "25", "27", "28", "30", "44", "46", "49"), class = "factor"), 
    GR = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), SES = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), 
    COND = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L), .Label = c("NEG-CTR", "NEG-NOC", "NEU-NOC"
    ), class = "factor"), value = c(-11.7785042972793, -9.14927207125904, 
    -7.58190508537766, -4.01515836011381, -6.60165385653499, 
    -2.02861964460179, 4.46729570509601, 2.54036572774646, 2.22923889930115, 
    -0.883620011106743, -2.63569087592267, -2.0629672230873, 
    1.14544537612393, 2.08056674659401, 0.0422658298956365, 13.2986259796748, 
    5.06669915366333, 3.93467692474742, 0.0229069420708053, 4.31923128857779, 
    0.237726051904304, 1.89972383690448, 3.2371880079134, 0.318100791495115, 
    -8.08292381883298, -5.73174008540523, -15.7998485301436, 
    1.75469999857951, 0.677370118816266, -1.8397955509895, 2.55445787016256, 
    -0.380810453692585, 0.62462329496673, 2.61316333850434, 2.68202480583985, 
    1.76690658846479, 0.148635887703097, -0.958853757041888, 
    -3.17305964093897, -7.82526758429289, -6.58557573679886, 
    -4.39207076049089, 2.36752476749952, 0.594715760553033, -0.29794568443312, 
    -4.5365387390683, 0.196832250811775, -2.70852853745588, 0.498995124872827, 
    0.165171574219401, 0.269498974991661, 0.901948386281446, 
    -2.45955661653299, 1.63525170542944, 0.155897732673534, 1.8491735212703, 
    -0.856727109535223, -1.16182571974245, 1.07658425742917, 
    -2.21433585407388, 4.3385479368043, 4.40588599635354, 0.127710423625772, 
    -6.26956613362656, -1.17658595005389, -7.25886366924741, 
    -0.888293709383838, -2.14177059335841, -2.42141595261389, 
    -2.958120275175, -5.1274001953303, -5.32347488769128, -4.41290818553442, 
    -1.21404719262173, -4.23649270310915)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -75L)), structure(list(
    ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 
    4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 
    9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L, 
    13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 
    17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 
    21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 
    25L, 25L), .Label = c("01", "04", "06", "07", "08", "09", 
    "10", "11", "12", "13", "15", "16", "17", "18", "19", "21", 
    "22", "23", "25", "27", "28", "30", "44", "46", "49"), class = "factor"), 
    GR = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), SES = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), 
    COND = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L), .Label = c("NEG-CTR", "NEG-NOC", "NEU-NOC"
    ), class = "factor"), value = c(-5.96429031525769, -5.10918437158799, 
    -2.81732229625975, -1.43557366487622, -3.14872157912645, 
    0.160393685024631, 3.52155765271648, 2.10437989449921, 2.70693992810407, 
    5.49897156207812, 5.81171180245335, -1.37301251388987, -0.434363848460157, 
    2.87987510596148, -1.27152670283348, 17.2093269365993, 7.79412746755931, 
    8.11964589961276, 4.95253363860044, 9.50695673265293, 4.15235381401148, 
    6.1294488368639, 8.01447499455337, 0.783414018677801, -1.24197194087055, 
    -0.487178595894761, -9.79031812534203, 4.22150266269492, 
    4.20139847550095, 0.208005397351335, 4.19096721581768, 0.815283302847055, 
    1.48137456347872, 2.0809543999959, 4.35199943309111, 2.84860039832237, 
    3.05879540677983, 2.11976068962167, -0.269002712326028, -2.77155065610474, 
    -2.59002218694999, 0.17928456999128, 2.24515223348079, 1.88805943988563, 
    -0.0920286086411814, -2.00968595029144, 2.59427260100332, 
    -1.27622011197768, 0.588399071755827, -1.43982473126936, 
    1.96978732491278, -0.338674980283045, -1.86484698930706, 
    -0.0154791822607025, 2.55036185373462, 4.42520405730058, 
    -0.599156247027551, 1.60091251589958, 4.7367320574401, -0.192490723623988, 
    4.8452288234686, 5.71745745981867, 1.02554478706585, -4.5951256708181, 
    1.1704842909792, -7.42770276334892, 3.15655538248828, -0.639830772856786, 
    -0.345116641695513, -0.0391030568720636, -2.61585906518491, 
    -2.71685194532693, -1.7348388034111, 1.00287124847525, -2.4844653851482
    )), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-75L)), structure(list(ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 
3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 
8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 
13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 
17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 21L, 
21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 25L, 25L
), .Label = c("01", "04", "06", "07", "08", "09", "10", "11", 
"12", "13", "15", "16", "17", "18", "19", "21", "22", "23", "25", 
"27", "28", "30", "44", "46", "49"), class = "factor"), GR = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "RP", class = "factor"), 
    SES = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "V", class = "factor"), COND = structure(c(1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("NEG-CTR", 
    "NEG-NOC", "NEU-NOC"), class = "factor"), value = c(8.23981597718437, 
    9.51261484648731, 9.42367409925817, 5.06332653216481, 5.02619159395405, 
    9.07903916629231, 7.56089165217984, 5.49719893790597, 4.91476855238182, 
    13.0320953572069, 10.8414516494484, 5.86927622259489, 3.25309970442897, 
    4.6847880297099, 2.71096740085175, 25.567439566524, 16.3241813617706, 
    13.0990192799703, 11.9200281736866, 14.6901305277101, 9.67397418905514, 
    10.2974302220899, 12.0768070828642, 5.9401530589224, 12.4817579327688, 
    12.419526465857, 1.00612108990875, 9.63063375751153, 10.5631237176538, 
    3.08031473770521, 3.35694102903017, 4.28046277054405, -0.133592200169464, 
    6.9103658689166, 7.64737651416791, 6.75669517393108, 8.5369185279747, 
    7.08645126073423, 4.47409706618326, 4.39617687043259, 3.27924738047746, 
    6.06169418872804, 5.34939694712468, 5.58288092654703, 4.85729686493463, 
    7.38032829587839, 11.7259526759912, 4.95764559864061, 6.24066579989613, 
    3.49843659402445, 4.07498375647916, 3.55732294589389, 1.33918111568512, 
    0.956782967443242, 2.32002496709926, 3.15289777246607, -0.832211906889126, 
    6.39254974438057, 7.0533787627062, 2.97245026797807, 6.23573445580928, 
    7.6052386193207, 2.98791225155534, 3.10850022259445, 8.12060882554471, 
    -0.00459651443883508, 13.5899217198075, 9.93070913311253, 
    8.10285456644801, 5.04464304009428, 2.02262615478956, 1.0510618938653, 
    5.62233873107127, 10.1193593084848, 5.87476640145049)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -75L)))), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), groups = structure(list(
    signals = c("LPPearlyCz", "LPPearlyFCz", "LPPearlyPz", "P3Cz", 
    "P3FCz", "P3Pz"), .rows = structure(list(5L, 4L, 6L, 2L, 
        1L, 3L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), .drop = TRUE))
> 

I've tried to check for the presence of outliers as follows:

# Flatten the nested tibbles, keep only the columns needed for the check,
# and flag outliers separately within each condition.
outliers_table <- df_join %>%
  unnest(cols = data) %>% # name the list-column explicitly; bare unnest() is deprecated
  dplyr::select(COND, signals, value) %>%
  group_by(COND) %>% # COND is used as the grouping variable in place of time
  identify_outliers(value)

That returns:

A tibble: 30 x 5
   COND    signals     value is.outlier is.extreme
   <fct>   <chr>       <dbl> <lgl>      <lgl>     
 1 NEG-CTR P3FCz       -11.6 TRUE       FALSE     
 2 NEG-CTR P3Cz         13.7 TRUE       FALSE     
 3 NEG-CTR P3Pz         15.1 TRUE       FALSE     
 4 NEG-CTR P3Pz         13.7 TRUE       FALSE     
 5 NEG-CTR P3Pz         23.4 TRUE       TRUE      
 6 NEG-CTR P3Pz         15.8 TRUE       FALSE     
 7 NEG-CTR P3Pz         14.4 TRUE       FALSE     
 8 NEG-CTR LPPearlyFCz -11.8 TRUE       FALSE     
 9 NEG-CTR LPPearlyCz   17.2 TRUE       FALSE     
10 NEG-CTR LPPearlyPz   25.6 TRUE       TRUE  

If I'm interested in deleting all of the values that are TRULY EXTREME (is.extreme is TRUE), how could I do that using some iterative function or an if statement? Please also consider other alternatives if they are easier than writing a for loop or some other function, ideally something that extends the command I've already written by adding another %>% step.

Thanks in advance

CodePudding user response:

All right. Let's do it together step by step. As I understand it, you have serious concerns that your data (I keep it in the variable df) contains outliers and even extreme values. First, we will extract only one nested tibble from your data and filter it for COND == "NEG-NOC".

library(tidyverse)
library(rstatix)
library(outliers)

# Take the first nested tibble and keep only the rows of the NEG-NOC condition.
data <- filter(df$data[[1]], COND == "NEG-NOC")

Now let's consider what method of outlier identification we will use. We can use the boxplot function for this.

# Values that boxplot.stats() reports in its `out` component.
boxplot.stats(data[["value"]])[["out"]]
#[1] 8.164181

This is fine, but it only gives us outliers in vector form. The second way is to use identify_outliers. This gives us a tibble but still only with those lines that have these outlier values.

# Returns only the rows flagged as outliers, with is.outlier / is.extreme columns.
identify_outliers(data, variable = "value")
# # A tibble: 1 x 7
# ID    GR    SES   COND    value is.outlier is.extreme
# <fct> <fct> <fct> <fct>   <dbl> <lgl>      <lgl>     
#   1 11    RP    V     NEG-NOC  8.16 TRUE       FALSE

Well, let's use the outlier function from the outliers package. This can give us a logical vector.

# Spell out TRUE instead of T: T is an ordinary variable that can be reassigned.
# First call returns the value itself, second call a logical mask over data$value.
outlier(data$value, opposite = TRUE)
#[1] 8.164181
outlier(data$value, opposite = TRUE, logical = TRUE)
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#[22] FALSE FALSE FALSE FALSE

However, neither of these methods will help you decide what to do with these outliers. Please read this carefully. As you can see, you have three options to choose from: imputation, capping, or prediction. Which one will you choose? I chose capping. So I wrote a tiny function that identifies outliers and extreme values and additionally returns your values after capping.

#' Flag outliers and extreme values in `data$value` and cap the outliers
#'
#' Uses fences built from the quartiles of `value`: a point is an outlier when
#' it lies more than 1.5 * IQR below the first or above the third quartile, and
#' extreme when it lies more than 3 * IQR beyond them. Outliers are replaced
#' ("capped") by the 5th / 95th percentile in a new column; `value` itself is
#' left untouched.
#'
#' @param data A data frame or tibble with a numeric column `value`.
#' @return `data` with three columns appended: `is.outlier` (logical),
#'   `is.extreme` (logical) and `cap.value` (numeric, `value` after capping).
fOutCapp <- function(data) {
  x <- data$value
  stopifnot(is.numeric(x))

  qnt <- unname(quantile(x, probs = c(0.25, 0.75), na.rm = TRUE))
  caps <- unname(quantile(x, probs = c(0.05, 0.95), na.rm = TRUE))
  spread <- IQR(x, na.rm = TRUE)
  H <- 1.5 * spread
  He <- 3 * spread

  below <- x < (qnt[1] - H)
  above <- x > (qnt[2] + H)

  # Both flags are derived from the ORIGINAL values; computing is.extreme
  # after capping would compare the percentile caps against the fences and
  # therefore never report an extreme point.
  is_outlier <- below | above
  is_extreme <- (x < (qnt[1] - He)) | (x > (qnt[2] + He))

  # which() drops NA comparisons so missing values are left as they are.
  capped <- x
  capped[which(below)] <- caps[1]
  capped[which(above)] <- caps[2]

  # Deliberately no group_by(COND) here: the statistics above are computed
  # over all rows of `data`, so callers split by group first (for example
  # with group_modify(), whose per-group input lacks the grouping column).
  data[["is.outlier"]] <- is_outlier
  data[["is.extreme"]] <- is_extreme
  data[["cap.value"]] <- capped
  data
}

Let's see if it works

# Show only the rows that fOutCapp() flagged as outliers ...
filter(fOutCapp(data), is.outlier)
# A tibble: 1 x 8
# ID    GR    SES   COND    value is.outlier is.extreme cap.value
# <fct> <fct> <fct> <fct>   <dbl> <lgl>      <lgl>          <dbl>
#   1 11    RP    V     NEG-NOC  8.16 TRUE       FALSE           4.95
# ... and then the complete result, original and capped values side by side.
fOutCapp(data)
# A tibble: 25 x 8
# ID    GR    SES   COND      value is.outlier is.extreme cap.value
# <fct> <fct> <fct> <fct>     <dbl> <lgl>      <lgl>          <dbl>
#   1 01    RP    V     NEG-NOC -11.1   FALSE      FALSE        -11.1  
# 2 04    RP    V     NEG-NOC   0.239 FALSE      FALSE          0.239
# 3 06    RP    V     NEG-NOC  -2.96  FALSE      FALSE         -2.96 
# 4 07    RP    V     NEG-NOC   1.09  FALSE      FALSE          1.09 
# 5 08    RP    V     NEG-NOC   2.99  FALSE      FALSE          2.99 
# 6 09    RP    V     NEG-NOC   5.42  FALSE      FALSE          5.42 
# 7 10    RP    V     NEG-NOC  -2.83  FALSE      FALSE         -2.83 
# 8 11    RP    V     NEG-NOC   8.16  TRUE       FALSE          4.95 
# 9 12    RP    V     NEG-NOC  -9.83  FALSE      FALSE         -9.83 
# 10 13    RP    V     NEG-NOC   2.12  FALSE      FALSE          2.12 
# ... with 15 more rows

Note, however, that each nested tibble in the data column contains several levels of the variable COND. So let's write one more tiny function that applies our fOutCapp to each of these groups.

# Apply fOutCapp() separately to each COND group of `data`.
# The per-group results are bound back together, still grouped by COND.
fOutCappGroup <- function(data) {
  by_cond <- group_by(data, COND)
  group_modify(by_cond, function(.x, .y) fOutCapp(.x))
}

# Run the grouped version on the first nested tibble (all three conditions).
fOutCappGroup(df$data[[1]])
# # A tibble: 75 x 8
# # Groups:   COND [3]
# COND    ID    GR    SES     value is.outlier is.extreme cap.value
# <fct>   <fct> <fct> <fct>   <dbl> <lgl>      <lgl>          <dbl>
#   1 NEG-CTR 01    RP    V     -11.6   FALSE      FALSE        -11.6  
# 2 NEG-CTR 04    RP    V      -0.314 FALSE      FALSE         -0.314
# 3 NEG-CTR 06    RP    V      -0.214 FALSE      FALSE         -0.214
# 4 NEG-CTR 07    RP    V      -2.83  FALSE      FALSE         -2.83 
# 5 NEG-CTR 08    RP    V       4.24  FALSE      FALSE          4.24 
# 6 NEG-CTR 09    RP    V       9.57  FALSE      FALSE          9.57 
# 7 NEG-CTR 10    RP    V      -6.13  FALSE      FALSE         -6.13 
# 8 NEG-CTR 11    RP    V       0.529 FALSE      FALSE          0.529
# 9 NEG-CTR 12    RP    V      -7.74  FALSE      FALSE         -7.74 
# 10 NEG-CTR 13    RP    V       1.27  FALSE      FALSE          1.27 
# # ... with 65 more rows

Bingo. Everything works great. Now we only need to do one simple mutation.

# Process every nested tibble with fOutCappGroup(), then flatten the result
# into one long tibble with one row per signal / condition / ID.
df %>%
  group_by(signals) %>%
  mutate(data = map(data, fOutCappGroup)) %>%
  unnest(cols = data)

output

# A tibble: 450 x 9
# Groups:   signals [6]
   signals COND    ID    GR    SES     value is.outlier is.extreme cap.value
   <chr>   <fct>   <fct> <fct> <fct>   <dbl> <lgl>      <lgl>          <dbl>
 1 P3FCz   NEG-CTR 01    RP    V     -11.6   FALSE      FALSE        -11.6  
 2 P3FCz   NEG-CTR 04    RP    V      -0.314 FALSE      FALSE         -0.314
 3 P3FCz   NEG-CTR 06    RP    V      -0.214 FALSE      FALSE         -0.214
 4 P3FCz   NEG-CTR 07    RP    V      -2.83  FALSE      FALSE         -2.83 
 5 P3FCz   NEG-CTR 08    RP    V       4.24  FALSE      FALSE          4.24 
 6 P3FCz   NEG-CTR 09    RP    V       9.57  FALSE      FALSE          9.57 
 7 P3FCz   NEG-CTR 10    RP    V      -6.13  FALSE      FALSE         -6.13 
 8 P3FCz   NEG-CTR 11    RP    V       0.529 FALSE      FALSE          0.529
 9 P3FCz   NEG-CTR 12    RP    V      -7.74  FALSE      FALSE         -7.74 
10 P3FCz   NEG-CTR 13    RP    V       1.27  FALSE      FALSE          1.27 
# ... with 440 more rows

This is how your task has been completed. Not only did we identify outliers, but we also applied capping to them. Now decide whether to use the value variable or the cap.value variable for further analysis. The decision is yours.

CodePudding user response:

If your function identify_outliers returns TRUE/FALSE, based on whether or not a given value is an outlier, then you can use filter(identify_outliers(value)) and move on.

  • Related