I am reviewing a dataset on flights and airport details. The dataset contains the following columns with the data about airport and flight details.
flight_date | op_career | tail_num | flight_num | origin | origin_airport_ID | dest_airport_ID | dest | ITIN_ID |
---|---|---|---|---|---|---|---|---|
03/13/2019 | AA | N900EV | 3503 | SFO | 11308 | 10397 | LAX | 201913000000 |
03/13/2019 | AA | N900EZ | 3502 | SFO | 11308 | 10397 | LAX | 209134000001 |
03/13/2019 | AS | N686BR | 3397 | SFO | 11308 | 10397 | LAX | 201965635145 |
03/13/2019 | SWA | N686BR | 3397 | ATL | 11151 | 10397 | LAX | 201965635145 |
03/13/2019 | JBU | N902FJ | 5806 | SFO | 11308 | 10397 | LAX | 205574511888 |
03/13/2019 | JBU | N902FJ | 5807 | JFK | 12953 | 11151 | ATL | 205574511888 |
03/13/2019 | AA | N932LR | 5804 | JFK | 12953 | 11151 | ATL | 205156495361 |
03/13/2019 | YV | N932LR | 5804 | SFO | 11308 | 10397 | LAX | 205156495361 |
03/14/2019 | DL | 255NV | 515 | SFO | 11308 | 10397 | LAX | 25145121551 |
03/14/2019 | DL | 225NV | 517 | ATL | 11151 | 11980 | GRI | 0450120125 |
03/13/2019 | DL | N686BR | 3397 | JFK | 12953 | 13451 | ORL | 478842166 |
03/13/2019 | AA | N902FJ | 5801 | ACT | 11666 | 03971 | BWI | 51218125 |
03/13/2019 | AS | N902FJ | 5805 | BGI | 74515 | 10397 | LAX | 4215112374 |
03/13/2019 | SW | N902FJ | 5809 | LAX | 10397 | 16847 | BGI | 4215112374 |
03/13/2019 | AA | N932LR | 5802 | ACV | 11695 | 05413 | PHX | 121545451 |
03/13/2019 | SW | N932LR | 5808 | BQK | 52141 | 18521 | SEA | 000215454 |
03/14/2019 | SW | 255NV | 515 | LKD | 11980 | 10397 | LAX | 251212121851 |
03/14/2019 | SW | 255NV | 515 | ATL | 11151 | 13451 | ORL | 251212121851 |
03/14/2019 | AA | 225NV | 517 | ATL | 11151 | 13451 | ORL | 121121511121 |
03/14/2019 | UA | 225NV | 517 | ATL | 11151 | 13451 | ORL | 121121511121 |
I want to find the busiest trip routes in terms of number of trip flights
so in this case a summary output would look something like this
Route | total_flights |
---|---|
SFO-LAX | 6 |
JFK-ATL | 2 |
ATL-ORL | 3 |
I am not sure how to code this.
Any help would be greatly appreciated. Thanks in advance :)
CodePudding user response:
Another approach with just dplyr
:
library(dplyr)
df1 %>%
mutate(Route = paste0(origin, "-", dest)) %>%
group_by(Route) %>%
summarize(total_flights = n()) %>%
arrange(desc(total_flights)) %>%
top_n(3)
Route total_flights
<chr> <int>
1 SFO-LAX 6
2 ATL-ORL 3
3 JFK-ATL 2
CodePudding user response:
We may use count
library(dplyr)
library(tidyr)
df1 %>%
count(origin, dest, sort = TRUE, name = 'total_flights') %>%
head(3) %>%
unite(Route, origin, dest, sep = '-')
-output
Route total_flights
1 SFO-LAX 6
2 ATL-ORL 3
3 JFK-ATL 2
data
df1 <- structure(list(flight_date = c("03/13/2019", "03/13/2019", "03/13/2019",
"03/13/2019", "03/13/2019", "03/13/2019", "03/13/2019", "03/13/2019",
"03/14/2019", "03/14/2019", "03/13/2019", "03/13/2019", "03/13/2019",
"03/13/2019", "03/13/2019", "03/13/2019", "03/14/2019", "03/14/2019",
"03/14/2019", "03/14/2019"), op_career = c("AA", "AA", "AS",
"SWA", "JBU", "JBU", "AA", "YV", "DL", "DL", "DL", "AA", "AS",
"SW", "AA", "SW", "SW", "SW", "AA", "UA"), tail_num = c("N900EV",
"N900EZ", "N686BR", "N686BR", "N902FJ", "N902FJ", "N932LR", "N932LR",
"255NV", "225NV", "N686BR", "N902FJ", "N902FJ", "N902FJ", "N932LR",
"N932LR", "255NV", "255NV", "225NV", "225NV"), flight_num = c(3503L,
3502L, 3397L, 3397L, 5806L, 5807L, 5804L, 5804L, 515L, 517L,
3397L, 5801L, 5805L, 5809L, 5802L, 5808L, 515L, 515L, 517L, 517L
), origin = c("SFO", "SFO", "SFO", "ATL", "SFO", "JFK", "JFK",
"SFO", "SFO", "ATL", "JFK", "ACT", "BGI", "LAX", "ACV", "BQK",
"LKD", "ATL", "ATL", "ATL"), origin_airport_ID = c(11308L, 11308L,
11308L, 11151L, 11308L, 12953L, 12953L, 11308L, 11308L, 11151L,
12953L, 11666L, 74515L, 10397L, 11695L, 52141L, 11980L, 11151L,
11151L, 11151L), dest_airport_ID = c(10397L, 10397L, 10397L,
10397L, 10397L, 11151L, 11151L, 10397L, 10397L, 11980L, 13451L,
3971L, 10397L, 16847L, 5413L, 18521L, 10397L, 13451L, 13451L,
13451L), dest = c("LAX", "LAX", "LAX", "LAX", "LAX", "ATL", "ATL",
"LAX", "LAX", "GRI", "ORL", "BWI", "LAX", "BGI", "PHX", "SEA",
"LAX", "ORL", "ORL", "ORL"), ITIN_ID = c(2.01913e 11, 209134000001,
201965635145, 201965635145, 205574511888, 205574511888, 205156495361,
205156495361, 25145121551, 450120125, 478842166, 51218125, 4215112374,
4215112374, 121545451, 215454, 251212121851, 251212121851, 121121511121,
121121511121)), class = "data.frame", row.names = c(NA, -20L))