Apologise for the rubbish title but struggling to phrase the problem.
I receive data from a client, they manually start and stop tests on site but some times they double press start test or stop test and this makes it hard for me to automate the output.
Here is some data
dput(test)
structure(list(time = structure(c(1645290963, 1645291107, 1645291112,
1645291203, 1645291306, 1645291441, 1645291532, 1645291689, 1645291721,
1645291866, 1645292051, 1645292182, 1645292444, 1645292539, 1645292557,
1645292935, 1645293077, 1645293117, 1645293229, 1645293275, 1645293425,
1645293429, 1645293555, 1645293584, 1645293735), tzone = "", class = c("POSIXct",
"POSIXt")), type = c("StartTesting", "StopTesting", "StartTesting",
"StopTesting", "StartTesting", "StopTesting", "StartTesting",
"StopTesting", "StartTesting", "StopTesting", "StartTesting",
"StopTesting", "StartTesting", "StopTesting", "StopTesting",
"StartTesting", "StopTesting", "StartTesting", "StopTesting",
"StartTesting", "StopTesting", "StartTesting", "StopTesting",
"StartTesting", "StopTesting"), comments = c("", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "")), row.names = c("event.12", "event.13", "event.14",
"event.15", "event.16", "event.17", "event.18", "event.19", "event.20",
"event.21", "event.22", "event.23", "event.25", "event.26", "event.27",
"event.28", "event.29", "event.30", "event.31", "event.32", "event.33",
"event.34", "event.35", "event.36", "event.37"), class = "data.frame")
Difficult to spot but at rowname - event.26 they pressed StopTesting
twice. I need to find a way to ensure there are equal amounts of starts and stops as part of my data processing I am try to clean data as shown below, but it stops me cbind
the data back together.
testdatstart <- filter(test, type =="StartTesting")
names(testdatstart)[names(testdatstart) == 'time'] <- 'TestStart'
names(testdatstart)[names(testdatstart) == 'comments'] <- 'StartComments'
testdatfin <- filter(test, type =="StopTesting")
names(testdatfin)[names(testdatfin) == 'time'] <- 'TestStop'
names(testdatfin)[names(testdatfin) == 'comments'] <- 'StopComments'
testdat <- cbind(testdatstart, testdatfin)
testdat <- testdat %>%
select(TestStart,TestStop, StartComments, StopComments)
and my desired output is this (I manually removed event.26 to achieve this)
dput(testdat)
structure(list(TestStart = structure(c(1645290963, 1645291112,
1645291306, 1645291532, 1645291721, 1645292051, 1645292444, 1645292935,
1645293117, 1645293275, 1645293429, 1645293584), tzone = "", class = c("POSIXct",
"POSIXt")), TestStop = structure(c(1645291107, 1645291203, 1645291441,
1645291689, 1645291866, 1645292182, 1645292539, 1645293077, 1645293229,
1645293425, 1645293555, 1645293735), tzone = "", class = c("POSIXct",
"POSIXt")), StartComments = c("", "", "", "", "", "", "", "",
"", "", "", ""), StopComments = c("", "", "", "", "", "", "",
"", "", "", "", "")), class = "data.frame", row.names = c("event.12",
"event.14", "event.16", "event.18", "event.20", "event.22", "event.25",
"event.28", "event.30", "event.32", "event.34", "event.36"))
So I am struggling to find a way to identify the pattern required, which should run start,stop,start,stop etc and then delete the second off the double entries.
CodePudding user response:
library(tidyverse)
df %>%
mutate(id = cumsum(type == "StartTesting")) %>%
arrange(id, type, desc(time)) %>%
distinct(id, type, .keep_all = T) %>%
pivot_wider(names_from = type, values_from = c(time, comments))
id time_StartTesting time_StopTesting comments_StartTesting comments_StopTesting
<int> <dttm> <dttm> <chr> <chr>
1 1 2022-02-19 18:16:03 2022-02-19 18:18:27 "" ""
2 2 2022-02-19 18:18:32 2022-02-19 18:20:03 "" ""
3 3 2022-02-19 18:21:46 2022-02-19 18:24:01 "" ""
4 4 2022-02-19 18:25:32 2022-02-19 18:28:09 "" ""
5 5 2022-02-19 18:28:41 2022-02-19 18:31:06 "" ""
6 6 2022-02-19 18:34:11 2022-02-19 18:36:22 "" ""
7 7 2022-02-19 18:40:44 2022-02-19 18:42:37 "" ""
8 8 2022-02-19 18:48:55 2022-02-19 18:51:17 "" ""
9 9 2022-02-19 18:51:57 2022-02-19 18:53:49 "" ""
10 10 2022-02-19 18:54:35 2022-02-19 18:57:05 "" ""
11 11 2022-02-19 18:57:09 2022-02-19 18:59:15 "" ""
12 12 2022-02-19 18:59:44 2022-02-19 19:02:15 "" ""
CodePudding user response:
You can use the dplyr
package to and filter
the type column: we simply check if type
is equal to the previous type
using lag
.
Afterwards you can transform your data using summarise
:
library(dplyr)
df %>%
as_tibble(rownames = "rownames") %>%
filter(type != lag(type, default = "")) %>%
group_by(gr = cumsum(type == "StartTesting")) %>%
summarise(rownames = first(rownames), TestStart = time[1], TestStop = time[2], .groups = "drop") %>%
select(-gr)
# A tibble: 12 x 3
rownames TestStart TestStop
<chr> <dttm> <dttm>
1 event.12 2022-02-19 18:16:03 2022-02-19 18:18:27
2 event.14 2022-02-19 18:18:32 2022-02-19 18:20:03
3 event.16 2022-02-19 18:21:46 2022-02-19 18:24:01
4 event.18 2022-02-19 18:25:32 2022-02-19 18:28:09
5 event.20 2022-02-19 18:28:41 2022-02-19 18:31:06
6 event.22 2022-02-19 18:34:11 2022-02-19 18:36:22
7 event.25 2022-02-19 18:40:44 2022-02-19 18:42:19
8 event.28 2022-02-19 18:48:55 2022-02-19 18:51:17
9 event.30 2022-02-19 18:51:57 2022-02-19 18:53:49
10 event.32 2022-02-19 18:54:35 2022-02-19 18:57:05
11 event.34 2022-02-19 18:57:09 2022-02-19 18:59:15
12 event.36 2022-02-19 18:59:44 2022-02-19 19:02:15