how to find the busiest airports from a dataset in R?-CodePudding

I am reviewing a dataset on flights and airport details. The dataset contains the following columns with the data about airport and flight details.

flight_date	op_career	tail_num	flight_num	origin	origin_airport_ID	dest_airport_ID	dest	ITIN_ID
03/13/2019	AA	N900EV	3503	SFO	11308	10397	LAX	201913000000
03/13/2019	AA	N900EZ	3502	SFO	11308	10397	LAX	209134000001
03/13/2019	AS	N686BR	3397	SFO	11308	10397	LAX	201965635145
03/13/2019	SWA	N686BR	3397	ATL	11151	10397	LAX	201965635145
03/13/2019	JBU	N902FJ	5806	SFO	11308	10397	LAX	205574511888
03/13/2019	JBU	N902FJ	5807	JFK	12953	11151	ATL	205574511888
03/13/2019	AA	N932LR	5804	JFK	12953	11151	ATL	205156495361
03/13/2019	YV	N932LR	5804	SFO	11308	10397	LAX	205156495361
03/14/2019	DL	255NV	515	SFO	11308	10397	LAX	25145121551
03/14/2019	DL	225NV	517	ATL	11151	11980	GRI	0450120125
03/13/2019	DL	N686BR	3397	JFK	12953	13451	ORL	478842166
03/13/2019	AA	N902FJ	5801	ACT	11666	03971	BWI	51218125
03/13/2019	AS	N902FJ	5805	BGI	74515	10397	LAX	4215112374
03/13/2019	SW	N902FJ	5809	LAX	10397	16847	BGI	4215112374
03/13/2019	AA	N932LR	5802	ACV	11695	05413	PHX	121545451
03/13/2019	SW	N932LR	5808	BQK	52141	18521	SEA	000215454
03/14/2019	SW	255NV	515	LKD	11980	10397	LAX	251212121851
03/14/2019	SW	255NV	515	ATL	11151	13451	ORL	251212121851
03/14/2019	AA	225NV	517	ATL	11151	13451	ORL	121121511121
03/14/2019	UA	225NV	517	ATL	11151	13451	ORL	121121511121

I want to find the busiest trip routes in terms of number of trip flights

so in this case a summary output would look something like this

Route	total_flights
SFO-LAX	6
JFK-ATL	2
ATL-ORL	3

I am not sure how to code this.

Any help would be greatly appreciated. Thanks in advance :)

CodePudding user response：

Another approach with just dplyr:

library(dplyr)

df1 %>% 
  mutate(Route = paste0(origin, "-", dest)) %>% 
  group_by(Route) %>% 
  summarize(total_flights = n()) %>% 
  arrange(desc(total_flights)) %>% 
  top_n(3)

  Route   total_flights
  <chr>           <int>
1 SFO-LAX             6
2 ATL-ORL             3
3 JFK-ATL             2

CodePudding user response：

We may use count

library(dplyr)
library(tidyr)
df1 %>%
  count(origin, dest, sort = TRUE, name = 'total_flights') %>% 
  head(3) %>%
  unite(Route, origin, dest, sep = '-')

-output

    Route total_flights
1 SFO-LAX             6
2 ATL-ORL             3
3 JFK-ATL             2

data

df1 <- structure(list(flight_date = c("03/13/2019", "03/13/2019", "03/13/2019", 
"03/13/2019", "03/13/2019", "03/13/2019", "03/13/2019", "03/13/2019", 
"03/14/2019", "03/14/2019", "03/13/2019", "03/13/2019", "03/13/2019", 
"03/13/2019", "03/13/2019", "03/13/2019", "03/14/2019", "03/14/2019", 
"03/14/2019", "03/14/2019"), op_career = c("AA", "AA", "AS", 
"SWA", "JBU", "JBU", "AA", "YV", "DL", "DL", "DL", "AA", "AS", 
"SW", "AA", "SW", "SW", "SW", "AA", "UA"), tail_num = c("N900EV", 
"N900EZ", "N686BR", "N686BR", "N902FJ", "N902FJ", "N932LR", "N932LR", 
"255NV", "225NV", "N686BR", "N902FJ", "N902FJ", "N902FJ", "N932LR", 
"N932LR", "255NV", "255NV", "225NV", "225NV"), flight_num = c(3503L, 
3502L, 3397L, 3397L, 5806L, 5807L, 5804L, 5804L, 515L, 517L, 
3397L, 5801L, 5805L, 5809L, 5802L, 5808L, 515L, 515L, 517L, 517L
), origin = c("SFO", "SFO", "SFO", "ATL", "SFO", "JFK", "JFK", 
"SFO", "SFO", "ATL", "JFK", "ACT", "BGI", "LAX", "ACV", "BQK", 
"LKD", "ATL", "ATL", "ATL"), origin_airport_ID = c(11308L, 11308L, 
11308L, 11151L, 11308L, 12953L, 12953L, 11308L, 11308L, 11151L, 
12953L, 11666L, 74515L, 10397L, 11695L, 52141L, 11980L, 11151L, 
11151L, 11151L), dest_airport_ID = c(10397L, 10397L, 10397L, 
10397L, 10397L, 11151L, 11151L, 10397L, 10397L, 11980L, 13451L, 
3971L, 10397L, 16847L, 5413L, 18521L, 10397L, 13451L, 13451L, 
13451L), dest = c("LAX", "LAX", "LAX", "LAX", "LAX", "ATL", "ATL", 
"LAX", "LAX", "GRI", "ORL", "BWI", "LAX", "BGI", "PHX", "SEA", 
"LAX", "ORL", "ORL", "ORL"), ITIN_ID = c(2.01913e 11, 209134000001, 
201965635145, 201965635145, 205574511888, 205574511888, 205156495361, 
205156495361, 25145121551, 450120125, 478842166, 51218125, 4215112374, 
4215112374, 121545451, 215454, 251212121851, 251212121851, 121121511121, 
121121511121)), class = "data.frame", row.names = c(NA, -20L))