How could I save web scraping info from multiple pages in a dataframe?


I'm trying to get info from pages 1, 2, 3, ... of https://myanimelist.net/topanime.php?limit=0 (instead of page=1, page=2 and so on, the URL goes limit=0, limit=50, limit=100, ...). The problem is that when the code loops through the number of pages I want, it gets the info from all of the pages, but only the info from the last one is saved in the new CSV file.

def main(number):
    driver = webdriver.Chrome()
    url = 'https://myanimelist.net/topanime.php?limit={}'

    if number <= 1:
        return url.format(0)
    elif number >= 2:
        for limit in range(0, (int(number) * 50), 50):
            driver.get(url.format(limit))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('tr', class_='ranking-list')

            with open('MAL_topanime.csv', 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                header = ['Anime', 'Date', 'No_eps', 'Ranking', 'Score']
                writer.writerow(header)

                for result in results:
                    Anime = result.find('h3', class_='hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3').text.replace('\n', '')
                    Date = result.find('div', class_='information di-ib mt4').text.replace('\n', '')
                    No_eps = result.find('div', class_='information di-ib mt4').text.replace('\n', '')
                    Ranking = result.find('td', class_='rank ac').text.replace('\n', '')
                    Score = result.find('div', class_='js-top-ranking-score-col di-ib al').text.replace('\n', '')
                    info = [Anime, Date, No_eps, Ranking, Score]
                    writer.writerow(info)
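
The reason only the last page ends up in the file is the with open('MAL_topanime.csv', 'w', ...) call inside the page loop: mode 'w' truncates the file, so every page overwrites the previous one. Below is a minimal sketch of the same approach with the file opened once, before the loop (imports added, selectors unchanged from the code above; the number <= 1 early return is left out here):

import csv

from bs4 import BeautifulSoup
from selenium import webdriver


def main(number):
    driver = webdriver.Chrome()
    url = 'https://myanimelist.net/topanime.php?limit={}'

    # Open the CSV once: the header is written a single time and
    # every page's rows are appended to the same open file.
    with open('MAL_topanime.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Anime', 'Date', 'No_eps', 'Ranking', 'Score'])

        for limit in range(0, int(number) * 50, 50):
            driver.get(url.format(limit))
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            for result in soup.find_all('tr', class_='ranking-list'):
                Anime = result.find('h3', class_='hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3').text.replace('\n', '')
                Date = result.find('div', class_='information di-ib mt4').text.replace('\n', '')
                No_eps = result.find('div', class_='information di-ib mt4').text.replace('\n', '')
                Ranking = result.find('td', class_='rank ac').text.replace('\n', '')
                Score = result.find('div', class_='js-top-ranking-score-col di-ib al').text.replace('\n', '')
                writer.writerow([Anime, Date, No_eps, Ranking, Score])

    driver.quit()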

CodePudding user response:

You can try the following example, which collects the rows from every page in a single list and builds one DataFrame at the end:

import requests
from bs4 import BeautifulSoup
import pandas as pd

data = []
# limit=0, 50, 100 -> the first three pages of the ranking
for limit in range(0, 150, 50):
    r = requests.get(f'https://myanimelist.net/topanime.php?limit={limit}')
    soup = BeautifulSoup(r.content, 'html.parser')

    results = soup.find_all('tr', class_='ranking-list')
    for result in results:
        Anime = result.find('h3', class_='hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3').text.replace('\n', '')
        # Note: Date and No_eps read the same "information" div, so they hold identical text
        Date = result.find('div', class_='information di-ib mt4').text.replace('\n', '')
        No_eps = result.find('div', class_='information di-ib mt4').text.replace('\n', '')
        Ranking = result.find('td', class_='rank ac').text.replace('\n', '')
        Score = result.find('div', class_='js-top-ranking-score-col di-ib al').text.replace('\n', '')

        data.append({
            'Anime': Anime,
            'Date': Date,
            'No_eps': No_eps,
            'Ranking': Ranking,
            'Score': Score
        })

# One DataFrame built from the rows of all pages
df = pd.DataFrame(data)
print(df)

Output:

                 Anime  ... Score
0                     Fullmetal Alchemist: Brotherhood  ...  9.12
1                            Bleach: Sennen Kessen-hen  ...  9.11
2           Kaguya-sama wa Kokurasetai: Ultra Romantic  ...  9.10
3                                             Gintama°  ...  9.08
4                                          Steins;Gate  ...  9.08
..                                                 ...  ...   ...
145                 Mushishi Zoku Shou: Odoro no Michi  ...  8.44
146                  Saenai Heroine no Sodatekata Fine  ...  8.44
147                       Wu Liuqi Zhi Xuanwu Guo Pian  ...  8.44
148  JoJo no Kimyou na Bouken Part 3: Stardust Crus...  ...  8.44
149        Gintama: Yorinuki Gintama-san on Theater 2D  ...  8.43

[150 rows x 5 columns]
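
Since the goal in the question is a CSV file, the DataFrame built above can also be written out with pandas, reusing the filename from the question:

df.to_csv('MAL_topanime.csv', index=False)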