I'm trying to get info from pages 1, 2, 3, ... of https://myanimelist.net/topanime.php?limit=0 (instead of page=1, page=2 and so on, the URL uses limit=0, limit=50, limit=100, ...).
How can I simplify the duplicated code inside the if/elif branches?
# Module-level accumulator: main() appends one dict per ranking row to it.
data = []


def _extract_row(result):
    """Build one record dict from a single ``tr.ranking-list`` row."""
    # NOTE(review): 'Date' and 'No_eps' were both read from the same
    # 'information di-ib mt4' element in the original code, so they carry
    # the same text. Kept as-is; confirm whether the text should be split.
    info = result.find('div', class_='information di-ib mt4').text.replace('\n', '')
    return {
        'Anime': result.find(
            'h3', class_='hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3'
        ).text.replace('\n', ''),
        'Date': info,
        'No_eps': info,
        'Ranking': result.find('td', class_='rank ac').text.replace('\n', ''),
        'Score': result.find(
            'div', class_='js-top-ranking-score-col di-ib al'
        ).text.replace('\n', ''),
    }


def main(number):
    """Scrape the first ``number`` pages of the top-anime ranking into ``data``.

    The site paginates with a ``limit`` offset that grows in steps of 50
    (0, 50, 100, ...) instead of a page number. Any ``number`` of 1 or less
    fetches only the first page (``limit=0``).

    Args:
        number: How many ranking pages to fetch.

    Side effects:
        Appends one dict per ranking row to the module-level ``data`` list.
    """
    url = 'https://myanimelist.net/topanime.php?limit={}'
    # One loop covers both former branches: a single page is simply the
    # range that contains only offset 0.
    page_count = max(int(number), 1)

    driver = webdriver.Chrome()
    try:
        for limit in range(0, page_count * 50, 50):
            driver.get(url.format(limit))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for result in soup.find_all('tr', class_='ranking-list'):
                data.append(_extract_row(result))
    finally:
        # Always shut the browser down, even if a page fails to parse.
        driver.quit()
CodePudding user response:
Instead of guessing the next page offset, I would extract the URL parameter from the "next" button, if it exists:
if soup.select_one('a.link-blue-box.next'):
url = 'https://myanimelist.net/topanime.php' soup.select_one('a.link-blue-box.next').get('href')
else:
break
It also does not need the big gun Selenium; simplify and use requests instead.
To get stripped text you can use .get_text(strip=True).
Example
For demonstration purposes the limit starts at 22100; simply set it to 0 to get all results.
import requests
from bs4 import BeautifulSoup
# Start offset for the demo; set limit=0 to walk the ranking from the start.
url = 'https://myanimelist.net/topanime.php?limit=22100'
data = []

# Follow the "next" button from page to page until there is none left.
while True:
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # parsers happen to be installed.
    soup = BeautifulSoup(r.text, 'html.parser')

    for e in soup.find_all('tr', class_='ranking-list'):
        # NOTE(review): 'Date' and 'No_eps' are read from the same element,
        # as in the original; confirm whether the text should be split.
        info = e.find('div', class_='information di-ib mt4').get_text(strip=True)
        data.append({
            # No trailing comma here: the original stored a 1-tuple.
            'Anime': e.h3.get_text(strip=True),
            'Date': info,
            'No_eps': info,
            'Ranking': e.find('td', class_='rank ac').get_text(strip=True),
            'Score': e.find(
                'div', class_='js-top-ranking-score-col di-ib al'
            ).get_text(strip=True),
        })

    next_link = soup.select_one('a.link-blue-box.next')
    if next_link is None:
        break
    # The href is appended to the base URL; the original line was missing
    # the '+' operator and did not parse.
    url = 'https://myanimelist.net/topanime.php' + next_link.get('href')

# Bare expression: shows the collected rows in a notebook/REPL.
data