I'm just learning web scraping and want to write the results of this website to a CSV file: https://www.avbuyer.com/aircraft/private-jets
I'm struggling with parsing the next pages, though. Here is my code (written with help from Amen Aziz), which only gives me the first page.
I'm using Chrome, so I'm not sure if that makes any difference. I'm running Python 3.8.12.
Thank you in advance.
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}

response = requests.get('https://www.avbuyer.com/aircraft/private-jets', headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
postings = soup.find_all('div', class_='listing-item premium')

temp = []
for post in postings:
    link = post.find('a', class_='more-info').get('href')
    link_full = 'https://www.avbuyer.com' + link
    plane = post.find('h2', class_='item-title').text
    price = post.find('div', class_='price').text
    location = post.find('div', class_='list-item-location').text
    desc = post.find('div', class_='list-item-para').text
    try:
        tag = post.find('div', class_='list-viewing-date').text
    except AttributeError:
        tag = 'N/A'
    updated = post.find('div', class_='list-update').text
    t = post.find_all('div', class_='list-other-dtl')
    for i in t:
        data = [tup.text for tup in i.find_all('li')]
        years = data[0]
        s = data[1]
        total_time = data[2]
        temp.append([plane, price, location, years, s, total_time, desc, tag, updated, link_full])

df = pd.DataFrame(temp, columns=["plane", "price", "location", "Year", "S/N", "Totaltime", "Description", "Tag", "Last Updated", "link"])

# The next-page link is found and fetched here, but the new soup is never
# fed back through the parsing loop above, so only page 1 ends up in df.
next_page = soup.find('a', {'rel': 'next'}).get('href')
next_page_full = 'https://www.avbuyer.com' + next_page
url = next_page_full
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')

df.to_csv('/Users/xxx/avbuyer.csv')
CodePudding user response:
Try this:
The code below requests each results page by its URL, so pagination is handled explicitly. It ends with print(df); if you want a CSV file, replace that line with df.to_csv("prod.csv") (index=False drops the pandas row index). A sketch that follows the site's rel="next" link instead of hard-coding the page range follows the output.
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}

temp = []
for page in range(1, 20):
    # Each results page has its own URL: .../private-jets/page-1, page-2, ...
    response = requests.get(
        "https://www.avbuyer.com/aircraft/private-jets/page-{page}".format(page=page),
        headers=headers,
    )
    soup = BeautifulSoup(response.content, 'html.parser')
    postings = soup.find_all('div', class_='grid-x list-content')
    for post in postings:
        plane = post.find('h2', class_='item-title').text
        try:
            price = post.find('div', class_='price').text
        except AttributeError:
            # Some listings have no price element
            price = " "
        location = post.find('div', class_='list-item-location').text
        t = post.find_all('div', class_='list-other-dtl')
        for i in t:
            data = [tup.text for tup in i.find_all('li')]
            years = data[0]
            s = data[1]
            total_time = data[2]
            temp.append([plane, price, location, years, s, total_time])

df = pd.DataFrame(temp, columns=["plane", "price", "location", "Years", "S/N", "Totaltime"])
print(df)
Output:
plane price ... S/N Totaltime
0 Gulfstream G280 Make offer ... S/N 2007 Total Time 2528
1 Dassault Falcon 2000LXS Make offer ... S/N 377 Total Time 33
2 Cirrus Vision SF50 G1 Please call ... S/N 0080 Total Time 615
3 Gulfstream IV Make offer ... S/N 1148 Total Time 6425
4 Gulfstream G280 Make offer ... S/N 2072 Total Time 1918
.. ... ... ... ... ...
342 Embraer Phenom 100 Now Sold ... S/N 50000035 Total Time 3417
343 Gulfstream G200 Now Sold ... S/N 152 Total Time 7209
344 Cessna Citation XLS Now Sold ... S/N - Total Time -
345 Cessna Citation Ultra Now Sold ... S/N 560-0393 Total Time 12947
346 Cessna Citation Excel Now Sold ... S/N 560XL-5253 Total Time 4850
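A possible alternative, if you'd rather not hard-code range(1, 20): keep following the rel="next" link that the original question's code already locates, and stop when it disappears. This is a minimal sketch, assuming the selectors from the answer above still match the site's markup; the parse_postings helper name is mine.

import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
base = 'https://www.avbuyer.com'

def parse_postings(soup, rows):
    # Same selectors as in the answer above.
    for post in soup.find_all('div', class_='grid-x list-content'):
        plane = post.find('h2', class_='item-title').text
        try:
            price = post.find('div', class_='price').text
        except AttributeError:
            price = ' '
        location = post.find('div', class_='list-item-location').text
        for i in post.find_all('div', class_='list-other-dtl'):
            data = [tup.text for tup in i.find_all('li')]
            rows.append([plane, price, location, data[0], data[1], data[2]])

rows = []
url = base + '/aircraft/private-jets'
while url:
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
    parse_postings(soup, rows)
    # On the last page there is no rel="next" anchor, so find() returns None.
    next_link = soup.find('a', {'rel': 'next'})
    url = base + next_link.get('href') if next_link else None

df = pd.DataFrame(rows, columns=["plane", "price", "location", "Years", "S/N", "Totaltime"])
df.to_csv('prod.csv', index=False)

This way the scraper stops exactly at the last page instead of guessing the page count in advance.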