I have a dataframe like this:
df <- structure(list(id = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "C", "C", "C", "C", "C",
"C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
"C", "C", "C", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D",
"D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "E", "E",
"E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E",
"E", "E", "E", "E", "E", "E", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "G", "G", "G", "G", "G", "G", "G", "G", "G", "G", "G", "G",
"G", "G", "G", "G", "G", "G", "G", "G", "G", "H", "H", "H", "H",
"H", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H",
"H", "H", "H", "H", "I", "I", "I", "I", "I", "I", "I", "I", "I",
"I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I"),
category = c("Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange"), number = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L,
6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L,
7L, 7L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L,
5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L,
6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L,
7L, 7L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L,
5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L), levels = c("1", "2", "3",
"4", "5", "6", "7"), class = "factor"), value = c(NA, 626.003642929148,
14.8530720820503, 1084.38927965752, 471.655282952508, 64.7236912331857,
776.07706769966, 396.916793965147, 54.4855105782135, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 545.062982509092,
1.91732773771467, NA, NA, NA, NA, NA, NA, NA, NA, NA, 394.894986584073,
195.280219395579, 7.64479280434642, 22.0315874609581, NA,
NA, NA, NA, NA, NA, NA, NA, 1080.64933432694, 805.86292498854,
13.482712704827, NA, NA, NA, NA, NA, NA, 1245.98805385198,
460.473209104431, 7.63424473634, 890.478501410027, 660.395220925587,
9.17592533879766, NA, 299.959907306427, 11.4907173073867,
NA, 491.805140195306, 177.400798058143, NA, NA, NA, NA, NA,
NA, 184.289228960705, 771.021642313767, 6.31043426543333,
281.726609330022, NA, NA, NA, NA, NA, NA, NA, NA, NA, 509.378049364324,
278.038058491692, 527.80894112378, 975.632789695641, 228.599356169827,
760.529109540798, 25.8508640237331, 208.36160360409, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 857.05785408852,
348.960106993208, 95.34862608411, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 643.00490054136,
176.787378197281, 454.641131289842, 875.117995998053, 89.4833304141165,
854.237890262158, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 657.837193971776, 201.407816618736, 218.002918108005,
981.375399584459, 252.181299279728, 254.482264508715, NA,
NA, 237.354256272709, NA, NA, 346.67665285344, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 207.808025637129, 418.411113648792,
30.3048309454833, 990.842297838783, 560.436400795975, 27.8512837727917,
1024.57867280143, 568.935110651167, 26.6536802259542, 887.976034157129,
NA, NA, NA, NA, NA, NA, NA, NA)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -189L), groups = structure(list(
id = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B",
"B", "B", "B", "B", "C", "C", "C", "C", "C", "C", "C", "D",
"D", "D", "D", "D", "D", "D", "E", "E", "E", "E", "E", "E",
"E", "F", "F", "F", "F", "F", "F", "F", "G", "G", "G", "G",
"G", "G", "G", "H", "H", "H", "H", "H", "H", "H", "I", "I",
"I", "I", "I", "I", "I"), number = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L
), levels = c("1", "2", "3", "4", "5", "6", "7"), class = "factor"),
.rows = structure(list(1:3, 4:6, 7:9, 10:12, 13:15, 16:18,
19:21, 22:24, 25:27, 28:30, 31:33, 34:36, 37:39, 40:42,
43:45, 46:48, 49:51, 52:54, 55:57, 58:60, 61:63, 64:66,
67:69, 70:72, 73:75, 76:78, 79:81, 82:84, 85:87, 88:90,
91:93, 94:96, 97:99, 100:102, 103:105, 106:108, 109:111,
112:114, 115:117, 118:120, 121:123, 124:126, 127:129,
130:132, 133:135, 136:138, 139:141, 142:144, 145:147,
148:150, 151:153, 154:156, 157:159, 160:162, 163:165,
166:168, 169:171, 172:174, 175:177, 178:180, 181:183,
184:186, 187:189), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -63L), .drop = TRUE))
As you can see there are many instances where there are NAs in the values column. What I would like to do is a rolling search (within each id), where I take a 3 row window (step of 1), and add a grouping label (in this case labeled ID_group_groupnumber) as soon as I hit 3 rows where there isn't an NA values in the value column. Importantly, one single value for each other category ("Apple","Banana", and "Orange") need to be present in the final group.
Here is what the output should look like, where groups can only be 3 rows, and every time this rule does not apply, an NA value is used as a label instead.
df <- structure(list(id = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "C", "C", "C", "C", "C",
"C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
"C", "C", "C", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D",
"D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "E", "E",
"E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E",
"E", "E", "E", "E", "E", "E", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "G", "G", "G", "G", "G", "G", "G", "G", "G", "G", "G", "G",
"G", "G", "G", "G", "G", "G", "G", "G", "G", "H", "H", "H", "H",
"H", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H",
"H", "H", "H", "H", "I", "I", "I", "I", "I", "I", "I", "I", "I",
"I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I"),
category = c("Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange", "Banana", "Apple",
"Orange", "Banana", "Apple", "Orange"), number = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L,
6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L,
7L, 7L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L,
5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L,
6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L,
7L, 7L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L,
5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L), levels = c("1", "2", "3",
"4", "5", "6", "7"), class = "factor"), value = c(NA, 626.003642929148,
14.8530720820503, 1084.38927965752, 471.655282952508, 64.7236912331857,
776.07706769966, 396.916793965147, 54.4855105782135, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 545.062982509092,
1.91732773771467, NA, NA, NA, NA, NA, NA, NA, NA, NA, 394.894986584073,
195.280219395579, 7.64479280434642, 22.0315874609581, NA,
NA, NA, NA, NA, NA, NA, NA, 1080.64933432694, 805.86292498854,
13.482712704827, NA, NA, NA, NA, NA, NA, 1245.98805385198,
460.473209104431, 7.63424473634, 890.478501410027, 660.395220925587,
9.17592533879766, NA, 299.959907306427, 11.4907173073867,
NA, 491.805140195306, 177.400798058143, NA, NA, NA, NA, NA,
NA, 184.289228960705, 771.021642313767, 6.31043426543333,
281.726609330022, NA, NA, NA, NA, NA, NA, NA, NA, NA, 509.378049364324,
278.038058491692, 527.80894112378, 975.632789695641, 228.599356169827,
760.529109540798, 25.8508640237331, 208.36160360409, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 857.05785408852,
348.960106993208, 95.34862608411, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 643.00490054136,
176.787378197281, 454.641131289842, 875.117995998053, 89.4833304141165,
854.237890262158, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 657.837193971776, 201.407816618736, 218.002918108005,
981.375399584459, 252.181299279728, 254.482264508715, NA,
NA, 237.354256272709, NA, NA, 346.67665285344, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 207.808025637129, 418.411113648792,
30.3048309454833, 990.842297838783, 560.436400795975, 27.8512837727917,
1024.57867280143, 568.935110651167, 26.6536802259542, 887.976034157129,
NA, NA, NA, NA, NA, NA, NA, NA), grouping = c(NA, "A_group1",
"A_group1", "A_group1", "A_group2", "A_group2", "A_group2",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "B_group1", "B_group1",
"B_group1", NA, NA, NA, NA, NA, NA, NA, NA, NA, "C_group1",
"C_group1", "C_group1", NA, NA, NA, NA, NA, NA, "C_group2",
"C_group2", "C_group2", "C_group3", "C_group3", "C_group3",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "D_group1",
"D_group1", "D_group1", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "E_group1", "E_group1", "E_group1", "E_group2", "E_group2",
"E_group2", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "F_group1", "F_group1", "F_group1", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"G_group1", "G_group1", "G_group1", "G_group2", "G_group2",
"G_group2", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "H_group1", "H_group1", "H_group1", "H_group2",
"H_group2", "H_group2", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "I_group1", "I_group1", "I_group1",
"I_group2", "I_group2", "I_group2", "I_group3", "I_group3",
"I_group3", NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
-189L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), groups = structure(list(id = c("A", "A", "A", "A", "A", "A",
"A", "B", "B", "B", "B", "B", "B", "B", "C", "C", "C", "C", "C",
"C", "C", "D", "D", "D", "D", "D", "D", "D", "E", "E", "E", "E",
"E", "E", "E", "F", "F", "F", "F", "F", "F", "F", "G", "G", "G",
"G", "G", "G", "G", "H", "H", "H", "H", "H", "H", "H", "I", "I",
"I", "I", "I", "I", "I"), number = structure(c(1L, 2L, 3L, 4L,
5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L), levels = c("1",
"2", "3", "4", "5", "6", "7"), class = "factor"), .rows = structure(list(
1:3, 4:6, 7:9, 10:12, 13:15, 16:18, 19:21, 22:24, 25:27,
28:30, 31:33, 34:36, 37:39, 40:42, 43:45, 46:48, 49:51, 52:54,
55:57, 58:60, 61:63, 64:66, 67:69, 70:72, 73:75, 76:78, 79:81,
82:84, 85:87, 88:90, 91:93, 94:96, 97:99, 100:102, 103:105,
106:108, 109:111, 112:114, 115:117, 118:120, 121:123, 124:126,
127:129, 130:132, 133:135, 136:138, 139:141, 142:144, 145:147,
148:150, 151:153, 154:156, 157:159, 160:162, 163:165, 166:168,
169:171, 172:174, 175:177, 178:180, 181:183, 184:186, 187:189), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -63L), .drop = TRUE))
I've wrangled the data into this format using expand grid for the columns 'category' and 'number' because I thought it would be easier to identify the groupings. To solve this problem, I thought of pivoting it to wide-form (but then each id is quite different and not sure how I'd group by the columns), and rollapply in the zoo package but not sure how to specify sequentially finding 3 rows without an NA. Importantly no overlap can occur. I appreciate any guidance/approaches to this in r.
Thanks very much.
CodePudding user response:
Number
inputs a vector and outputs 1:3 repeated to the length of that vector.
except that NA's are used at the end if 1:3 does not fit evenly.
Then within group for each run of NA's or not NA's apply Number
using ave
and then sequentially number the triples within id. At the end we make the grouping
names and NA out the grouping entries that should be NA.
library(dplyr)
library(data.table) # rleid
Number <- function(x) {
r <- rep(1:3, length = length(x))
n <- 3 * (length(r) %/% 3)
ifelse(seq_along(x) > n, NA, r)
}
result <- df %>%
group_by(id) %>%
mutate(grouping = ave(1:n(), rleid(!is.na(value)), FUN = Number) 0*value,
grouping = cumsum(coalesce(grouping, 0) == 1) 0 * grouping) %>%
ungroup %>%
mutate(grouping = ifelse(is.na(grouping), NA, paste0(id, "_group", grouping)))