Home > Software design >  Loop through URL using Python
Loop through URL using Python

Time:09-09

I have looked at a few questions, but none of the answers seem to fit. I am building a web scraper as a personal project. I have figured out the loops to get rider data for the Vuelta 2022; however, I need to loop through the URLs for every stage. For some reason, the URL loop only uses the last number in the range. My gut feeling is that the formatting is the problem, so I have been playing around with that, but with no luck.

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch and parse the ranking page of each stage (j = 1..9).
# NOTE: only the five indented lines below belong to this loop. Once the loop
# has finished, `url` and `soup` still hold the values from the final
# iteration (stage 9).
for j in range (1,10):
    url = (f"https://www.lavuelta.es/en/rankings/stage-{j}")
    page = requests.get(url)
    urlt = page.content
    soup = BeautifulSoup(urlt)
    # NOTE: the result list is re-created on every pass of the stage loop.
    rider_rank_list = []
# NOTE: this loop starts at column 0, so it is NOT nested inside the stage
# loop above. It runs once, after all pages were fetched, and reads only the
# last `soup` -- which is why the output contains the final stage only.
for i in range (1,11):
# Rider name link in the i-th row of the ranking table.
    results = soup.select_one(f"body > main > div > section.ranking.classements > div > div > div.js-tabs-wrapper.js-tabs-bigwrapper > div > div > div > div > div.js-spinner-wrapper > div > div.sticky-scroll > table > tbody > tr:nth-child({i}) > td.runner.is-sticky > a ")

        
# First cell of the same row, used as the rider's rank.
    rrank = soup.select_one(f"body > main > div > section.ranking.classements > div > div > div.js-tabs-wrapper.js-tabs-bigwrapper > div > div > div > div > div.js-spinner-wrapper > div > div.sticky-scroll > table > tbody > tr:nth-child({i}) > td:nth-child(1)")


# Stage label derived from the last URL path segment, e.g. "stage-9" -> "Stage 9".
    stage = str.replace(str.title(url.rsplit('/', 1)[-1]),'-',' ')

    # One (stage, rider, rank) tuple per row; `.text` is read on both lookups
    # without checking that the selectors matched anything.
    rider_rank_list.append((str(stage),str.strip(results.text), str.strip(rrank.text)))


    
# Show the collected rows, then build a table from them and save it as CSV.
print(rider_rank_list)
df = pd.DataFrame(rider_rank_list, columns=['stage','rider','rank'], index=None)
print(df)

df.to_csv('data.csv', index=False)


CodePudding user response:

All of the data is in a single table, so there are no further pages to loop through. Because the table is part of the static HTML of the page, pandas alone can read it into a DataFrame with `read_html`.

import pandas as pd
# Let pandas download the page and parse its HTML tables; the indexing below
# keeps the first table found, which is the one printed as the ranking.
url = "https://www.lavuelta.es/en/rankings/stage#"
tables = pd.read_html(url)
df = tables[0]
print(df)

Output:

    Rank                      Rider  Rider No.  ...             Gap         B  P
0       1            REMCO EVENEPOEL        134  ...               -  B : 10''  -
1       2                  ENRIC MAS        124  ...    00h 00' 02''   B : 6''  -
2       3              ROBERT GESINK          4  ...    00h 00' 02''   B : 4''  -
3       4                JAI HINDLEY         44  ...    00h 00' 13''         -  -
4       5            THYMEN ARENSMAN        151  ...    00h 00' 13''         -  -
..    ...                        ...        ...  ...             ...       ... ..
129   130            KENNY ELISSONDE        163  ...    00h 33' 24''         -  -
130   131             DAVIDE CIMOLAI         53  ...    00h 33' 31''         -  -
131   132                ALEX KIRSCH        165  ...    00h 35' 17''         -  -
132   133              MADS PEDERSEN        167  ...    00h 35' 17''         -  -
133   134  IVO MANUEL ALVES OLIVEIRA        173  ...    00h 35' 48''         -  -

[134 rows x 8 columns]

CodePudding user response:

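Fixed the indentation, with small changes: the result list is created once before the loops, the row loop is nested inside the stage loop, and rows whose rider cell is not found are skipped.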

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Stage numbers to scrape and the table rows (1-based) to read per stage.
STAGES = range(1, 10)
TOP_ROWS = range(1, 11)

# CSS path to one row of the stage ranking table. The rider and rank lookups
# share this prefix, so it is defined once; `{row}` is the 1-based row number.
ROW_SELECTOR = (
    "body > main > div > section.ranking.classements > div > div > "
    "div.js-tabs-wrapper.js-tabs-bigwrapper > div > div > div > div > "
    "div.js-spinner-wrapper > div > div.sticky-scroll > table > tbody > "
    "tr:nth-child({row})"
)

# Created once, before the loops, so rows from every stage accumulate here
# as (stage, rider, rank) tuples.
rider_rank_list = []

for j in STAGES:
    url = f"https://www.lavuelta.es/en/rankings/stage-{j}"
    page = requests.get(url)
    # Name the parser explicitly so the parse result does not depend on which
    # optional parsers happen to be installed alongside BeautifulSoup.
    soup = BeautifulSoup(page.content, "html.parser")

    # Stage label from the last URL path segment, e.g. "stage-3" -> "Stage 3".
    # It is the same for every row of the page, so compute it once per stage.
    stage = url.rsplit("/", 1)[-1].title().replace("-", " ")

    for i in TOP_ROWS:
        row = ROW_SELECTOR.format(row=i)
        # Rider name link and rank cell (first column) of the i-th row.
        results = soup.select_one(f"{row} > td.runner.is-sticky > a")
        rrank = soup.select_one(f"{row} > td:nth-child(1)")

        # select_one returns None when nothing matches; skip rows where either
        # cell is missing instead of failing on `.text`.
        if results is None or rrank is None:
            continue

        rider_rank_list.append((stage, results.text.strip(), rrank.text.strip()))

print(rider_rank_list)
df = pd.DataFrame(rider_rank_list, columns=["stage", "rider", "rank"])
print(df)

df.to_csv("data.csv", index=False)
  • Related