I'm trying to extract data from a site and then create a DataFrame out of it. The program doesn't work properly. I'm new to web scraping. I hope someone can help me find the problem.
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup
url = 'https://www.imdb.com/chart/top/?sort=rk,asc&mode=simple&page=1'
# Use the response as a context manager so the connection is closed as soon
# as the document has been parsed.
with urlopen(url) as page:
    soup = BeautifulSoup(page, 'html.parser')

# Each <tr> inside the first <tbody> is treated as one film row.
film_in = soup.find('tbody').find_all('tr')

# Work on the first row only, to check each field before looping over all rows.
film = film_in[0]

# Find the title link through its enclosing "titleColumn" element instead of
# matching a `title` attribute value that is specific to a single film.
titre = film.select_one('.titleColumn a')
print(titre.text)

# The rating is the text of the <strong> inside the rating cell.
rang = film.find("td", {'class': 'ratingColumn imdbRating'}).find('strong').text
def remove_parentheses(string: str) -> str:
    """Return *string* with every "(" and ")" character removed."""
    return string.replace("(", "").replace(")", "")
# Release year of the first film, still wrapped in parentheses at this point.
année = film.find("span", {'class': 'secondaryInfo'}).text

# Collect one dict per film row, then build the DataFrame from the list.
imdb = []
for film in film_in:
    # Read every field from the current row. The loop variable must be the
    # same name the lookups use, otherwise the first film is repeated.
    titre = film.select_one('.titleColumn a').text
    rang = film.find("td", {'class': 'ratingColumn imdbRating'}).find('strong').text
    année = remove_parentheses(film.find("span", {'class': 'secondaryInfo'}).text)
    dictionnaire = {
        'film': titre,  # store the title text, not the whole <tr> tag
        'rang': rang,
        'année': année,
    }
    imdb.append(dictionnaire)

df_imdb = pd.DataFrame(imdb)
print(df_imdb)
I'm trying to extract data from a site and then create a DataFrame out of it. The program doesn't work properly. I need to solve it using urllib; is there a way? Thanks in advance. I'm new to web scraping.
CodePudding user response:
You can try the following example:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import pandas as pd
url = 'https://www.imdb.com/chart/top/?sort=rk,asc&mode=simple&page=1'
# Alternative fetch using requests instead of urllib:
# soup = BeautifulSoup(requests.get(url).text, 'html.parser')
with urlopen(url) as page:
    # Parse inside the `with` block so the response is closed afterwards.
    soup = BeautifulSoup(page, 'html.parser')

imdb = []
# NOTE(review): the attribute selectors in the posted snippet were emptied
# ("table[] tr", "[] > strong") and are not valid CSS. They are rebuilt here
# from the tag/class names used in the question's code. Selecting rows from
# <tbody> presumably leaves out the header row, so no [1:] slice is applied
# -- confirm against the live page.
film_in = soup.select('tbody tr')
for film in film_in:
    titre = film.select_one('.titleColumn a').get_text(strip=True)
    rang = film.select_one('td.ratingColumn.imdbRating > strong').text
    année = film.find("span", {'class': 'secondaryInfo'}).get_text(strip=True)
    dictionnaire = {
        'titre': titre,
        'rang': rang,
        'année': année,
    }
    imdb.append(dictionnaire)

df_imdb = pd.DataFrame(imdb)
print(df_imdb)
Output:
titre rang année
0 The Shawshank Redemption 9.2 (1994)
1 The Godfather 9.2 (1972)
2 The Dark Knight 9.0 (2008)
3 The Godfather Part II 9.0 (1974)
4 12 Angry Men 9.0 (1957)
.. ... ... ...
245 Dersu Uzala 8.0 (1975)
246 Aladdin 8.0 (1992)
247 The Help 8.0 (2011)
248 The Iron Giant 8.0 (1999)
249 Gandhi 8.0 (1982)
[250 rows x 3 columns]