I have looked at a few questions, but none of the answers seem to fit. I am building a web-scraper tool as a personal project. I have figured out the loops to get rider data for the Vuelta 2022, but I need to loop through the URLs for each stage. For some reason, the URL loop only ends up with the last number in the range. My gut feeling is that the formatting is the problem, so I have been playing around with that, but with no luck.
import requests
from bs4 import BeautifulSoup
import pandas as pd

for j in range(1, 10):
    url = (f"https://www.lavuelta.es/en/rankings/stage-{j}")
    page = requests.get(url)
    urlt = page.content
    soup = BeautifulSoup(urlt)
    rider_rank_list = []
    for i in range(1, 11):
        #create list of riders
        results = soup.select_one(f"body > main > div > section.ranking.classements > div > div > div.js-tabs-wrapper.js-tabs-bigwrapper > div > div > div > div > div.js-spinner-wrapper > div > div.sticky-scroll > table > tbody > tr:nth-child({i}) > td.runner.is-sticky > a ")
        #create rider rank list
        rrank = soup.select_one(f"body > main > div > section.ranking.classements > div > div > div.js-tabs-wrapper.js-tabs-bigwrapper > div > div > div > div > div.js-spinner-wrapper > div > div.sticky-scroll > table > tbody > tr:nth-child({i}) > td:nth-child(1)")
        #create stage name
        stage = str.replace(str.title(url.rsplit('/', 1)[-1]),'-',' ')
        rider_rank_list.append((str(stage),str.strip(results.text), str.strip(rrank.text)))
        print(rider_rank_list)

df = pd.DataFrame(rider_rank_list, columns=['stage','rider','rank'], index=None)
print(df)
df.to_csv('data.csv', index=False)
CodePudding user response:
All of the data is in a single static HTML table, so there is no next-page handling needed. You can use pandas on its own, since the table is plain static HTML that read_html can parse directly.
import pandas as pd
url = "https://www.lavuelta.es/en/rankings/stage#"
# read_html parses every <table> on the page into a DataFrame; the ranking table is the first one
df = pd.read_html(url)[0]
print(df)
Output:
Rank Rider Rider No. ... Gap B P
0 1 REMCO EVENEPOEL 134 ... - B : 10'' -
1 2 ENRIC MAS 124 ... 00h 00' 02'' B : 6'' -
2 3 ROBERT GESINK 4 ... 00h 00' 02'' B : 4'' -
3 4 JAI HINDLEY 44 ... 00h 00' 13'' - -
4 5 THYMEN ARENSMAN 151 ... 00h 00' 13'' - -
.. ... ... ... ... ... ... ..
129 130 KENNY ELISSONDE 163 ... 00h 33' 24'' - -
130 131 DAVIDE CIMOLAI 53 ... 00h 33' 31'' - -
131 132 ALEX KIRSCH 165 ... 00h 35' 17'' - -
132 133 MADS PEDERSEN 167 ... 00h 35' 17'' - -
133 134 IVO MANUEL ALVES OLIVEIRA 173 ... 00h 35' 48'' - -
[134 rows x 8 columns]
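
If you still want one row per rider per stage across all nine stages, a minimal sketch of extending this approach (assuming the per-stage URLs follow the stage-{j} pattern from the question, and that read_html finds the ranking table first on each page) could be:

import pandas as pd

frames = []
for j in range(1, 10):
    # assumption: each stage's ranking lives at a stage-{j} URL and is the first table on that page
    url = f"https://www.lavuelta.es/en/rankings/stage-{j}"
    stage_df = pd.read_html(url)[0]
    stage_df['stage'] = f'stage {j}'
    frames.append(stage_df)

df = pd.concat(frames, ignore_index=True)
print(df)
df.to_csv('data.csv', index=False)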
CodePudding user response:
Fixed the indentation, with some small changes:
import requests
from bs4 import BeautifulSoup
import pandas as pd

rider_rank_list = []

for j in range(1, 10):
    url = (f"https://www.lavuelta.es/en/rankings/stage-{j}")
    page = requests.get(url)
    urlt = page.content
    soup = BeautifulSoup(urlt)
    for i in range(1, 11):
        #create list of riders
        results = soup.select_one(f"body > main > div > section.ranking.classements > div > div > div.js-tabs-wrapper.js-tabs-bigwrapper > div > div > div > div > div.js-spinner-wrapper > div > div.sticky-scroll > table > tbody > tr:nth-child({i}) > td.runner.is-sticky > a ")
        if results is not None:
            #create rider rank list
            rrank = soup.select_one(f"body > main > div > section.ranking.classements > div > div > div.js-tabs-wrapper.js-tabs-bigwrapper > div > div > div > div > div.js-spinner-wrapper > div > div.sticky-scroll > table > tbody > tr:nth-child({i}) > td:nth-child(1)")
            #create stage name
            stage = str.replace(str.title(url.rsplit('/', 1)[-1]),'-',' ')
            rider_rank_list.append((str(stage),str.strip(results.text), str.strip(rrank.text)))
            print(rider_rank_list)

df = pd.DataFrame(rider_rank_list, columns=['stage','rider','rank'], index=None)
print(df)
df.to_csv('data.csv', index=False)
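
The key change is that rider_rank_list is created once before the outer loop, so every stage appends to the same list instead of wiping it on each iteration. A less brittle variant (just a sketch, assuming the div.sticky-scroll table structure is the same on every stage page) walks the table rows directly instead of hard-coding tr:nth-child selectors:

import requests
from bs4 import BeautifulSoup
import pandas as pd

rider_rank_list = []
for j in range(1, 10):
    url = f"https://www.lavuelta.es/en/rankings/stage-{j}"
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    # build the stage name from the last URL segment, e.g. "stage-3" -> "Stage 3"
    stage = url.rsplit('/', 1)[-1].replace('-', ' ').title()
    # iterate over every row of the ranking table instead of selecting each row by index
    for row in soup.select("div.sticky-scroll table tbody tr"):
        rider = row.select_one("td.runner a")
        rank = row.select_one("td:nth-child(1)")
        if rider and rank:
            rider_rank_list.append((stage, rider.text.strip(), rank.text.strip()))

df = pd.DataFrame(rider_rank_list, columns=['stage', 'rider', 'rank'])
print(df)
df.to_csv('data.csv', index=False)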