I have a large dataset which logs each visit by id
across the day at a specific location.
What I would like to do is plot the cumulative number of visits by each individual id
over the time period the data were collected.
A sample of the data looks like this; the full data set has visits over a number of days.
I've tried some variations using cumsum
but just can't get it to work.
dput(df)
structure(list(date = c("06/01/2021", "06/01/2021", "06/01/2021",
"06/01/2021", "06/01/2021", "06/01/2021", "06/01/2021", "06/01/2021",
"06/01/2021", "06/01/2021", "06/01/2021", "06/01/2021", "06/01/2021",
"06/01/2021", "06/01/2021", "06/01/2021", "06/01/2021", "06/01/2021",
"06/01/2021", "06/01/2021", "06/01/2021", "06/01/2021", "06/01/2021",
"06/01/2021", "06/01/2021", "06/01/2021", "06/01/2021", "06/01/2021",
"06/01/2021", "06/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021", "07/01/2021",
"07/01/2021", "07/01/2021", "08/01/2021", "08/01/2021", "08/01/2021",
"08/01/2021", "08/01/2021", "08/01/2021", "08/01/2021", "08/01/2021",
"08/01/2021", "08/01/2021", "08/01/2021", "08/01/2021", "08/01/2021",
"08/01/2021"), time = c("08:02:54", "08:04:48", "08:04:49", "08:05:49",
"08:05:50", "08:05:50", "08:05:51", "08:06:32", "08:06:33", "08:07:34",
"08:07:34", "08:07:35", "08:07:36", "08:07:36", "08:09:52", "08:09:53",
"08:09:53", "08:10:02", "08:10:04", "08:10:05", "08:10:05", "08:10:07",
"08:10:08", "08:10:22", "08:10:42", "08:10:43", "08:11:14", "08:11:15",
"08:11:38", "08:11:39", "08:11:39", "08:11:40", "08:11:40", "08:11:41",
"08:11:48", "08:11:50", "08:11:51", "08:11:51", "08:11:52", "08:11:53",
"08:11:54", "08:11:54", "08:12:36", "08:12:37", "08:12:38", "08:12:38",
"08:13:25", "08:13:25", "08:14:09", "08:14:18", "08:14:19", "08:14:24",
"08:14:24", "08:14:25", "08:14:37", "08:14:38", "08:14:58", "08:14:58",
"08:14:59", "08:14:59", "08:15:03", "08:15:04", "08:15:04", "08:15:05",
"08:15:12", "08:15:13", "08:15:13", "08:15:33", "08:15:34", "08:15:37",
"08:15:39", "08:15:51", "08:16:12", "08:16:13", "08:16:14", "08:16:31",
"08:16:32", "08:16:42", "08:17:00", "08:17:00", "08:17:01", "08:17:03",
"08:17:19", "08:17:20", "08:17:22", "08:17:26", "08:17:26", "08:17:27",
"08:17:27", "08:17:32", "08:17:32", "08:17:33", "08:17:50", "08:17:51",
"08:17:51", "08:17:52", "08:18:38", "08:18:39", "08:18:39", "08:18:40",
"08:18:41", "08:18:41", "08:19:44", "08:19:44", "08:19:46", "08:19:46",
"08:22:27", "08:23:20", "08:23:20", "08:23:47", "08:23:48", "08:23:48",
"08:23:52", "08:23:52"), id = c(2L, 3L, 2L, 3L, 4L, 5L, 3L, 4L,
3L, 2L, 3L, 3L, 2L, 4L, 5L, 2L, 3L, 2L, 2L, 2L, 4L, 3L, 2L, 2L,
4L, 5L, 3L, 2L, 4L, 5L, 3L, 3L, 4L, 5L, 6L, 4L, 3L, 5L, 4L, 5L,
4L, 3L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L,
5L, 3L, 2L, 4L, 5L, 3L, 2L, 2L, 4L, 5L, 3L, 3L, 4L, 5L, 6L, 4L,
3L, 5L, 4L, 5L, 4L, 3L, 2L, 2L, 3L, 2L, 4L, 5L, 3L, 3L, 4L, 5L,
6L, 4L, 3L, 5L, 4L, 5L, 4L, 3L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 3L,
2L, 4L, 5L, 3L, 3L, 4L, 5L, 6L, 4L, 3L)), class = "data.frame", row.names = c(NA,
-114L))
head(df)
date time id
1 06/01/2021 08:02:54 2
2 06/01/2021 08:04:48 3
3 06/01/2021 08:04:49 2
4 06/01/2021 08:05:49 3
5 06/01/2021 08:05:50 4
6 06/01/2021 08:05:50 5
CodePudding user response:
Do you mean something like this?
lubridate is used to convert your data into a datetime object (easier to handle),
then cumsum(!duplicated(datetime)) counts the number of (unique) visits by id.
The result is then plotted with ggplot2.
The last line allows you to modify the x-axis breaks.
# Plot the running number of distinct visit timestamps for each id over time.
library(dplyr)
library(lubridate)
library(ggplot2)

df %>%
  # Parse with %Y: the dates carry four-digit years ("06/01/2021"). With %y
  # only "20" is read and every visit would be placed in the year 2020.
  mutate(
    datetime = as_datetime(paste(as.Date(date, format = "%d/%m/%Y"), time))
  ) %>%
  group_by(id) %>%
  # Within each id, count each distinct timestamp once, cumulatively.
  mutate(cum_visits = cumsum(!duplicated(datetime))) %>%
  ungroup() %>%
  ggplot(aes(x = datetime, y = cum_visits, color = factor(id), group = id)) +
  geom_line() +
  # One x-axis break per day; adjust the break width or label format here.
  scale_x_datetime(
    breaks = scales::date_breaks("1 day"),
    date_labels = "%D - %H:%M"
  )
CodePudding user response:
A ggplot() solution that treats date as a factor variable,
shown either for selected time steps or for all time steps.
Number of visits by id and date:
# Tally the visits for every id/date pair and store date as a factor.
library(data.table)

dt <- as.data.table(df)
# `:=` adds `count` to dt by reference; .N is the number of rows per group.
dt[, count := .N, by = .(id, date)]
dd <- dt
dd$date <- as.factor(dd$date)
Create the plot:
# Heat map of visits: one tile per (time, id), filled by `count`, with one
# panel per date. Layers must be chained with `+`; otherwise only the empty
# ggplot() canvas is drawn and the remaining calls are evaluated on their own.
library(ggplot2)

ggplot(dd, aes(y = id, x = time, fill = count)) +
  geom_tile() +
  # Label only a subset of time stamps; remove this for all time-steps.
  scale_x_discrete(
    breaks = c(
      "08:02:54", "08:05:50", "08:07:34", "08:10:02",
      "08:13:25", "08:16:32", "08:19:44", "08:23:52"
    )
  ) +
  facet_wrap(~date) +
  scale_fill_gradient(low = "lightyellow", high = "red") +
  labs(x = "Time", y = "Id", title = "", fill = "Number of visits") +
  theme_bw() +
  # Single theme() call; the repeated plot.title settings are merged into one.
  theme(
    plot.title = element_text(
      hjust = 0.5, face = "bold", size = 20, color = "black"
    ),
    axis.title.x = element_text(
      family = "Times", face = "bold", size = 16, color = "black"
    ),
    axis.title.y = element_text(
      family = "Times", face = "bold", size = 16, color = "black"
    ),
    axis.text.x = element_text(
      hjust = 1, face = "bold", size = 14, color = "black", angle = 90
    ),
    axis.text.y = element_text(
      hjust = 1, face = "bold", size = 14, color = "black"
    ),
    legend.title = element_text(
      family = "Times", color = "black", size = 16, face = "bold"
    ),
    legend.text = element_text(
      family = "Times", color = "black", size = 14, face = "bold"
    ),
    legend.position = "right",
    strip.text.x = element_text(
      size = 16, colour = "black", family = "Times", face = "bold"
    )
  )
or without facet_wrap()