Home > Blockchain >  `dplyr` results change when filtering and separate_rows
`dplyr` results change when filtering and separate_rows

Time:08-03

I have data on pupillary responses during Questions in conversation. The pupil data are stringed together in columns A*, B*, and C*, with columns ending in *intpl_new showing the interpolated pupil area values and the columns ending with *dur giving the durations of each pupil observation. Dataframe df contains the data for two illustrative Question Sequences:

df <- structure(list(Speaker = c("ID08.A", "ID08.A"), Utterance = c("what so you're going to a £!party! in Italy£=", 
                                                                    "so what's your work"), Sequ = c(379, 380), File = c("F08", "F08"
                                                                    ), A_pupil_dur = c("9,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,6", 
                                                                                       "4,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,16"
                                                                    ), B_pupil_dur = c("5,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,10", 
                                                                                       "0,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,3"
                                                                    ), C_pupil_dur = c("13,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,2", 
                                                                                       "8,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,12"
                                                                    ), A_Area_av_intpl_new = c("439.711, 439.711, 644.691, 731.96, 747.196, 748.7275, 754.4505, 762.587, 760.0615, 767.383, 770.2685, 775.8755, 770.246, 780.847, 785.325, 787.0505, 796.164, 792.5955, 803.6015, 805.0865, 812.2755, 811.0335, 817.3775, 809.7155, 804.172, 784.5615, 785.306, 773.668, 787.1165, 780.432, 770.338, 772.0045, 775.238, 765.1055, 770.028, 765.566, 761.882, 762.095, 764.063, 769.38, 767.2175, 787.711, 773.056, 794.4395, 775.7075, 772.3825, 764.177, 759.029, 758.788, 756.673, 755.184, 762.655, 777.8965, 787.1845, 782.783, 779.6265, 790.063, 790.716, 792.483, 813.861, 796.0765, 793.4735, 786.3455, 782.2775, 782.3615, 765.5825, 748.5415, 733.5605, 740.209, 716.731, 714.564, 692.6865, 677.8545, 663.384, 653.542, 661.2205, 642.244, 630.52, 634.3305, 623.638, 614.8775, 598.457, 550.281, 533.3955, 534.857, 556.981, 520.699, 504.2235, 483.121, 533.9275, 517.6205, 564.656, 486.9505, 496.907, 482.0425, 498.3545, 488.9415, 514.838, 485.5285, 458.243, 447.065, 429.248, 408.462, 381.2095, 366.082, 361.209, 351.755, 366.533, 365.6965, 372.575, 381.1985, 374.6075, 374.187, 378.831, 377.856, 380.95, 374.515, 386.3795, 372.8235, 367.6245, 311.0305, 285.302, 285.302", 
                                                                                               "450.732, 450.732, 450.732, 450.732, 450.732, 673.6895, 673.2105, 681.709, 690.044, 685.161, 694.2225, 690.4845, 693.8295, 702.695, 706.5325, 712.255, 706.277, 715.655, 716.3665, 715.771, 738.173, 746.6575, 753.729, 741.008, 747.346, 739.562, 740.895, 741.728, 746.875, 745.368, 750.3755, 740.992, 746.3275, 747.992, 745.482, 737.803, 732.567, 665.135, 590.805, 599.918, 517.5955, 232.453, 185.6525, 177.0585, 349.1175, 610.161, 715.3055, 723.8695, 731.505, 736.0385"
                                                                    ), B_Area_av_intpl_new = c("1127.5655, 1133.472, 1144.686, 1137.3125, 1146.6885, 1146.521, 1146.165, 1136.645, 1107.046, 1169.5485, 1171.6375, 1185.374, 911.0355, 514.7785, 302.262, 201.9995, 221.0165, 364.1835, 554.6255, 830.8855, 982.6595, 1055.199, 1236.219, 1313.743, 1347.7445, 1341.755, 1367.371, 1058.6165, 816.283875, 573.95125, 331.618625, 89.286, 97.243, 783.2605, 921.9195, 1023.9135, 1099.347, 1170.8585, 1191.3015, 1218.248, 1232.114, 1248.672, 1269.5065, 1274.4395, 1284.665, 1289.562, 1290.2045, 1274.3455, 1257.694, 1251.4575, 1241.0405, 1217.684, 1202.611, 1180.6035, 1161.036, 1140.593, 1127.1275, 1108.0965, 1091.227, 1076.481, 1060.9835, 1047.0645, 1026.973, 1020.352, 1010.53, 1001.069, 988.176, 987.625, 979.6735, 966.375, 965.7815, 949.6985, 944.108, 942.71, 938.5365, 936.1955, 932.9745, 928.864, 932.069, 931.5345, 928.105, 925.708, 924.489, 931.071, 939.366, 936.182, 939.049, 940.7355, 943.43, 951.3945, 953.896, 960.797, 964.893, 976.1915, 973.9215, 986.8575, 988.273, 999.3865, 1008.348, 1012.5235, 1026.1885, 1024.859, 1034.6715, 1040.397, 1053.101, 1061.7265, 1069.12, 1072.339, 1073.477, 1080.543, 1081.047, 1089.56, 1094.2435, 1099.5435, 1098.3245, 1098.1435, 1103.4685, 1112.3565, 1111.2075, 1119.047, 1111.2515, 1106.309, 1109.1225", 
                                                                                               "NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA"
                                                                    ), C_Area_av_intpl_new = c("1310.6215, 1315.454, 1322.456, 1318.831, 1323.633, 1320.5265, 1321.4315, 1321.8755, 1327.908, 1328.7995, 1327.1185, 1328.311, 1329.5015, 1275.664, 1318.0675, 1295.021, 1358.674, 1313.429, 1296.748, 1279.574, 1260.8705, 1253.4135, 1275.716, 1256.1905, 1260.968, 1266.579, 1267.513, 1265.8265, 1262.5765, 1252.0955, 1218.903, 1188.4015, 1189.9125, 1171.238, 1160.559, 1146.503, 1139.204, 1132.484, 1122.7765, 1117.295, 1101.205, 1100.462, 1094.296, 1086.83, 1077.7585, 1071.4795, 1064.321, 1066.358, 1026.922, 1055.0745, 1035.7405, 1047.858, 1033.132, 1039.793, 1021.0405, 1029.5575, 1027.678, 1029.3585, 1026.111, 1028.1745, 1025.4395, 1030.1245, 1022.905, 1027.13, 1034.6355, 1036.9165, 1031.264, 1040.098, 1035.224, 1038.8585, 1036.9795, 1046.472, 1049.66, 1051.9625, 1057.0575, 1057.5905, 1057.0835, 1064.5255, 1066.282, 1062.145, 1068.6555, 1068.9065, 1070.3065, 1075.3095, 1077.345, 1076.921, 1082.6835, 1080.1855, 1087.028, 1091.712, 1089.9055, 1097.7805, 1092.521, 1096.772, 1095.7135, 1096.116, 1099.6245, 1094.1805, 1096.947, 1100.037, 1095.832, 1101.221, 1097.206, 1102.7895, 1097.554, 1100.1235, 1105.832, 1101.3535, 1093.1925, 1104.321, 1097.7195, 1074.774, 1075.12, 806.9485, 272.4795, 270.039625, 267.59975, 265.159875, 262.72, 175.0135, 126.3875, 126.3875, 126.3875", 
                                                                                               "1135.087, 1126.958, 1134.757, 1137.2935, 1144.709, 1143.489, 1120.5105, 1168.576, 1175.9775, 1207.8175, 1169.8565, 1187.4535, 1191.8855, 1208.294, 1222.445, 1217.7895, 1213.134, 1216.726, 1223.882, 1214.1995, 1171.062, 1144.856, 1150.527, 1153.6485, 1133.79, 1123.989, 1116.856, 1110.5225, 1098.6115, 1087.8905, 1080.286, 1077.1005, 1067.366, 1058.493, 1048.9075, 1050.1335, 1039.5365, 1036.7075, 1033.1095, 1025.224, 1022.4685, 1017.6755, 1018.1755, 1012.436, 1009.0545, 1010.995, 1009.37, 1010.7515, 1012.3335, 1006.5675"
                                                                    )), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
                                                                    ))

I'm stuck in a seemingly simple task: to find the maximum *intpl_new value for each participant A, B, and C and the time when that maximum value occurs. When I filter Sequ==379, everything works out fine with this code:

df %>%
  
  # Filter `Sequ`:
  filter(Sequ == 379) %>%
  
  # cast longer:
  pivot_longer(cols = matches("(_Area_av_intpl_new|pupil_dur)$"),
               names_to = c("PP_by", ".value"),
               # 1. capt. gr.: initial character ("A", "B", or "C"); 
               # 2. capt. gr.: everything after underscore                
               names_pattern = "^(.)_(.*)$") %>% 
  # separate each `Area_av_intpl` and corresponding `PP_by` value into their own row:
  separate_rows(c(Area_av_intpl_new, pupil_dur), sep = ",", convert = TRUE) %>%
  # rename:
  rename(Area = Area_av_intpl_new, PP_dur = pupil_dur) %>% 
  
  # Remove `NA` and " NA" (with preceding whitespace!):
  filter(!(is.na(Area) | Area == " NA")) %>%
  
  ### Create `PP_dur_cumsum`:
  group_by(Utterance, PP_by, Sequ) %>%
  mutate(PP_dur_cumsum = cumsum(PP_dur)) %>%
  
  ### Find maximal PP values:
  summarise(across(c(Speaker,File), first),
    max_Area_Q = max(Area),
    time_max_Area_Q = PP_dur_cumsum[Area == max_Area_Q]) %>% 
  arrange(Sequ) %>%
  
  ### Put new values back onto unique `Utterance` row:
  pivot_wider( 
    names_from = PP_by, 
    names_glue = "{PP_by}_{.value}",
    values_from = c(time_max_Area_Q, max_Area_Q)) %>% 
  ungroup()
# A tibble: 1 × 10
  Utterance               Sequ Speaker File  A_time_max_Area_Q B_time_max_Area… C_time_max_Area… A_max_Area_Q B_max_Area_Q C_max_Area_Q
  <chr>                  <dbl> <chr>   <chr>             <int>            <int>            <int>        <dbl>        <dbl>        <dbl>
1 what so you're going …   379 ID08.A  F08                 375              438              279         817.        1367.        1359.

However, when I do not filter (i.e., comment out the filter operation) but run the code over both Sequ, the values get incorrect:

# A tibble: 2 × 10
  Utterance               Sequ Speaker File  A_time_max_Area_Q B_time_max_Area… C_time_max_Area… A_max_Area_Q B_max_Area_Q C_max_Area_Q
  <chr>                  <dbl> <chr>   <chr>             <int>            <int>            <int> <chr>        <chr>        <chr>       
1 what so you're going …   379 ID08.A  F08                   9                5               13 439.711      1127.5655    1310.6215   
2 so what's your work      380 ID08.A  F08                   4               NA                8 450.732      NA           1135.087 

My hunch is that the incorrect values are caused by the NA values in Sequ==380. But I can't seem to find the right way to handle them and to find a right solution. Help with this task and this admittedly difficult type of data is much appreciated!

CodePudding user response:

Indeed, the error occurs because separate_rows(..., convert = TRUE) doesn't convert to numeric when chars are present. Consider df2 as a base case:

df2 <- df %>%
  # Filter `Sequ`:
  filter(Sequ == 379) %>%
  
  # cast longer:
  pivot_longer(cols = matches("(_Area_av_intpl_new|pupil_dur)$"),
               names_to = c("PP_by", ".value"),
               # 1. capt. gr.: initial character ("A", "B", or "C"); 
               # 2. capt. gr.: everything after underscore                
               names_pattern = "^(.)_(.*)$") %>% 
  # separate each `Area_av_intpl` and corresponding `PP_by` value into their own row:
separate_rows(c(Area_av_intpl_new, pupil_dur), sep = ",", convert = TRUE) %>%
  # rename:
  rename(Area = Area_av_intpl_new, PP_dur = pupil_dur)

And df3 the unfiltered tibble:

df3 <- df %>%
  # cast longer:
  pivot_longer(cols = matches("(_Area_av_intpl_new|pupil_dur)$"),
               names_to = c("PP_by", ".value"),
               # 1. capt. gr.: initial character ("A", "B", or "C"); 
               # 2. capt. gr.: everything after underscore                
               names_pattern = "^(.)_(.*)$") %>% 
  # separate each `Area_av_intpl` and corresponding `PP_by` value into their own row:
  separate_rows(c(Area_av_intpl_new, pupil_dur), sep = ",", convert = TRUE) %>%
  # rename:
  rename(Area = Area_av_intpl_new, PP_dur = pupil_dur)

res2 <- df3 %>% filter(Sequ == 379) 

This isn't identical because Area is of character.

identical(df2, res2)
[1] FALSE

Adding an explicit conversion after the filter fixes the problem.

df3 <- df %>%
  # cast longer:
  pivot_longer(cols = matches("(_Area_av_intpl_new|pupil_dur)$"),
               names_to = c("PP_by", ".value"),
               # 1. capt. gr.: initial character ("A", "B", or "C"); 
               # 2. capt. gr.: everything after underscore                
               names_pattern = "^(.)_(.*)$") %>% 
  separate_rows(c(Area_av_intpl_new, pupil_dur), sep = ",", convert = TRUE) %>%
  rename(Area = Area_av_intpl_new, PP_dur = pupil_dur) %>% 
  # filtering
  filter(!(is.na(Area) | Area == " NA")) %>%
  # ...and then converting 
  mutate(Area=as.numeric(Area))


res2 <- df3 %>% filter(Sequ == 379) 
identical(df2, res2)
[1] TRUE

This allows us to continue chaining, and yields your expected result.

df3 %>%       
group_by(Utterance, PP_by, Sequ) %>%
  mutate(PP_dur_cumsum = cumsum(PP_dur)) %>%
  
  ### Find maximal PP values:
  summarise(across(c(Speaker,File), first),
    max_Area_Q = max(Area),
    time_max_Area_Q = PP_dur_cumsum[Area == max_Area_Q]) %>% 
  arrange(Sequ) %>%
  
  ### Put new values back onto unique `Utterance` row:
  pivot_wider( 
    names_from = PP_by, 
    names_glue = "{PP_by}_{.value}",
    values_from = c(time_max_Area_Q, max_Area_Q)) %>% 
  ungroup()
# A tibble: 2 × 10
  Utterance                                      Sequ Speaker File  A_time_max_Area_Q B_time_max_Area_Q C_time_max_Area_Q A_max_Area_Q
  <chr>                                         <dbl> <chr>   <chr>             <int>             <int>             <int>        <dbl>
1 what so you're going to a £!party! in Italy£=   379 ID08.A  F08                 375               438               279         817.
2 so what's your work                             380 ID08.A  F08                 372                NA               308         754.
  B_max_Area_Q C_max_Area_Q
         <dbl>        <dbl>
1        1367.        1359.
2          NA         1224.

The values you saw were probably representing levels of factor conversion from character.

For added clarity, I would suggest creating intermediate results if memory is not an issue.

  •  Tags:  
  • r na
  • Related