I have a large data set which I show below:
> str(citibank)
spec_tbl_df [33,912,996 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ tripduration : num [1:33912996] 680 1282 648 631 621 ...
$ starttime : POSIXct[1:33912996], format: "2017-01-01 00:00:21" "2017-01-01 00:00:45" "2017-01-01 00:00:57" "2017-01-01 00:01:10" ...
$ stoptime : POSIXct[1:33912996], format: "2017-01-01 00:11:41" "2017-01-01 00:22:08" "2017-01-01 00:11:46" "2017-01-01 00:11:42" ...
$ start.station.id : num [1:33912996] 3226 3263 3143 3143 3143 ...
$ start.station.name : chr [1:33912996] "W 82 St & Central Park West" "Cooper Square & E 7 St" "5 Ave & E 78 St" "5 Ave & E 78 St" ...
$ start.station.latitude : num [1:33912996] 40.8 40.7 40.8 40.8 40.8 ...
$ start.station.longitude: num [1:33912996] -74 -74 -74 -74 -74 ...
$ end.station.id : num [1:33912996] 3165 498 3152 3152 3152 ...
$ end.station.name : chr [1:33912996] "Central Park West & W 72 St" "Broadway & W 32 St" "3 Ave & E 71 St" "3 Ave & E 71 St" ...
$ end.station.latitude : num [1:33912996] 40.8 40.7 40.8 40.8 40.8 ...
$ end.station.longitude : num [1:33912996] -74 -74 -74 -74 -74 ...
$ bikeid : num [1:33912996] 25542 21136 18147 21211 26819 ...
$ usertype : chr [1:33912996] "Subscriber" "Subscriber" "Customer" "Customer" ...
$ birth.year : int [1:33912996] 1965 1987 NA NA NA 2000 1973 1977 1989 1980 ...
$ gender : num [1:33912996] 2 2 0 0 0 1 1 2 1 1 ...
I used the following dplyr
function to calculate the number of times commuters traveled between two location start.station.id
and end.station.id
:
grouped = citibank %>%
group_by(start.station.id, end.station.id) %>%
summarise(count = n())
And obtained the resulting table shown below:
> grouped
# A tibble: 407,790 × 3
# Groups: start.station.id [866]
start.station.id end.station.id count
<dbl> <dbl> <int>
1 72 72 1621
2 72 79 94
3 72 82 6
4 72 116 49
5 72 127 455
6 72 128 121
7 72 143 5
8 72 144 1
9 72 146 57
10 72 147 7
# … with 407,780 more rows
Now I would like add the columns start.station.latitude
and end.station.latitude
from the citibank
data set to the grouped
tibble, so that they correspond to their respective start.station.id
end.station.id
values in the citibank
data set.
Can someone help me doing this? Thank you in advance.
CodePudding user response:
Could just select relevant columns, use add_count
, and take the unique rows:
citibank %>%
select(start.station.id, end.station.id, start.station.latitude, end.station.latitude) %>%
add_count(start.station.id, end.station.id) %>%
distinct()
CodePudding user response:
Another option is to add a left_join
to your pipe to add the additional columns back in.
library(tidyverse)
citibank %>%
group_by(start.station.id, end.station.id) %>%
summarise(count = n()) %>%
left_join(., citibank %>% select(start.station.id, end.station.id, start.station.latitude, end.station.latitude), by = c("start.station.id", "end.station.id"))
Output
start.station.id end.station.id count start.station.latitude end.station.latitude
<dbl> <dbl> <int> <dbl> <dbl>
1 3143 3152 3 40.8 40.8
2 3143 3152 3 40.8 40.8
3 3143 3152 3 40.8 40.8
4 3163 3163 1 40.8 40.8
5 3226 3165 1 40.8 40.8
6 3263 498 1 40.7 40.7
Data
citibank <- structure(list(tripduration = c(680, 1282, 648, 631, 621, 666
), starttime = structure(c(1483228821, 1483228845, 1483228857,
1483228870, 1483228885, 1483228911), tzone = "UTC", class = c("POSIXct",
"POSIXt")), stoptime = structure(c(1483229501, 1483230128, 1483229506,
1483229502, 1483229507, 1483229577), tzone = "UTC", class = c("POSIXct",
"POSIXt")), start.station.id = c(3226, 3263, 3143, 3143, 3143,
3163), start.station.name = c("W 82 St & Central Park West",
"Cooper Square & E 7 St", "5 Ave & E 78 St", "5 Ave & E 78 St",
"5 Ave & E 78 St", "Central Park West & W 68 St"), start.station.latitude = c(40.78275,
40.7292364991001, 40.7768286343997, 40.7768286343997, 40.7768286343997,
40.7734066), start.station.longitude = c(-73.97137, -73.9908680319786,
-73.9638876914978, -73.9638876914978, -73.9638876914978, -73.97782542
), end.station.id = c(3165, 498, 3152, 3152, 3152, 3163), end.station.name = c("Central Park West & W 72 St",
"Broadway & W 32 St", "3 Ave & E 71 St", "3 Ave & E 71 St", "3 Ave & E 71 St",
"Central Park West & W 68 St"), end.station.latitude = c(40.7757937668367,
40.74854862, 40.76873687, 40.76873687, 40.76873687, 40.7734066
), end.station.longitude = c(-73.9762057363987, -73.98808416,
-73.96119945, -73.96119945, -73.96119945, -73.97782542), bikeid = c(25542,
21136, 18147, 21211, 26819, 16050), usertype = c("Subscriber",
"Subscriber", "Customer", "Customer", "Customer", "Subscriber"
), birth.year = c(1965L, 1987L, NA, NA, NA, 2000L), gender = c(2,
2, 0, 0, 0, 1)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
CodePudding user response:
A third solution which should work - add all four values to count
:D
grouped <- citibank %>%
count(start.station.id,
end.station.id,
start.station.latitude,
end.station.latitude)