Data collection with web scraping - CodePudding

I'm trying to extract data from a site and then create a DataFrame out of it. The program doesn't work properly. I'm new to web scraping. I hope someone can help me out and find the problem.

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the chart page and parse it with Beautiful Soup's 'html.parser' backend.
url = 'https://www.imdb.com/chart/top/?sort=rk,asc&mode=simple&page=1'

page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')

#print(soup)

# Collect every <tr> found inside the first <tbody> of the document.
film_in= soup.find('tbody').findAll('tr')

#print(film_in)
# Prototype the extraction on the first row only.
film = film_in[0]
#print(film)


# NOTE(review): the <a> is matched by an exact `title` attribute that names one
# specific film's director and cast, so this lookup only fits that film's row.
titre = film.find("a",{'title':'Frank Darabont (dir.), Tim Robbins, Morgan Freeman'})
print(titre.text)


# Rating: text of the <strong> inside the "ratingColumn imdbRating" cell.
rang = film.find("td",{'class':'ratingColumn imdbRating'}).find('strong').text
#print(rang)

def remove_parentheses(string):
    """Return *string* with every "(" and ")" character removed."""
    # A single translate pass deletes both characters at once.
    return string.translate(str.maketrans("", "", "()"))


# Year text of the first row, taken from its "secondaryInfo" span
# (parentheses are not stripped at this point).
année = film.find("span",{'class':'secondaryInfo'}).text
#print(année)

# One dict per row is appended here and turned into a DataFrame after the loop.
imdb =[]

# NOTE(review): the loop variable is `films`, but the body reads `film`, which
# is still bound to film_in[0] from earlier in the script. Every iteration
# therefore re-extracts the first row instead of the current one.
for films in film_in:
    # NOTE(review): the `title` attribute is hard-coded to one film's credits.
    titre = film.find("a",{'title':'Frank Darabont (dir.), Tim Robbins, Morgan Freeman'})

    rang = film.find("td",{'class':'ratingColumn imdbRating'}).find('strong').text

    année =(remove_parentheses(film.find("span",{'class':'secondaryInfo'}).text))

    # NOTE(review): the 'film' key stores the whole row element (`film`);
    # `titre` is computed above but never put into the dict.
    dictionnaire = {'film': film,
                    'rang': rang,
                    'année':année
                    }
    imdb.append(dictionnaire)

# NOTE(review): `pd` is not imported in the lines shown; this call needs
# `import pandas as pd`.
df_imdb = pd.DataFrame(imdb)
print(df_imdb)

I'm trying to extract data from a site and then create a DataFrame out of it. The program doesn't work properly. I need to solve it using urllib; is there a way? Thanks in advance. I'm new to web scraping.

CodePudding user response:

You can try the following example:

    from bs4 import BeautifulSoup
    from urllib.request import urlopen
    import requests
    import pandas as pd
    
    url = 'https://www.imdb.com/chart/top/?sort=rk,asc&mode=simple&page=1'

    # urllib is used because the question asks for it. The requests-based
    # equivalent would be:
    #     soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    # The `with` block closes the HTTP response once the page has been parsed.
    with urlopen(url) as page:
        soup = BeautifulSoup(page, 'html.parser')

    imdb = []
    # NOTE(review): this selector previously read 'table[] tr', which is not
    # valid CSS; the attribute filter that narrowed it to the chart table was
    # lost. 'table tr' matches the rows of every table in the document, so
    # restore a narrower filter if the page contains more than one table.
    film_in = soup.select('table tr')
    # The first row is skipped; presumably it is the header row, not a film.
    for film in film_in[1:]:
        titre = film.select_one('.titleColumn a').get_text(strip=True)
        # Rating: the <strong> that is a direct child of the rating cell
        # (classes "ratingColumn" and "imdbRating").
        rang = film.select_one('td.ratingColumn.imdbRating > strong').text

        # Year text is kept as-is, including its surrounding parentheses.
        année = film.find("span", {'class': 'secondaryInfo'}).get_text(strip=True)

        dictionnaire = {'titre': titre,
                        'rang': rang,
                        'année': année
                        }
        imdb.append(dictionnaire)

    df_imdb = pd.DataFrame(imdb)
    print(df_imdb)

Output:

                        titre rang   année
0    The Shawshank Redemption  9.2  (1994)
1               The Godfather  9.2  (1972)
2             The Dark Knight  9.0  (2008)
3       The Godfather Part II  9.0  (1974)
4                12 Angry Men  9.0  (1957)
..                        ...  ...     ...
245               Dersu Uzala  8.0  (1975)
246                   Aladdin  8.0  (1992)
247                  The Help  8.0  (2011)
248            The Iron Giant  8.0  (1999)
249                    Gandhi  8.0  (1982)

[250 rows x 3 columns]