I'm currently trying to create a dataset of specific Metacritic game data. First, I collect the list of all game URLs and export it as a .csv file; then I run a second script that scrapes each game page and writes the data to an .xlsx file.
Now I need to modify the script so it also gets the developer in addition to the publisher.
Should it be something like this?
developer = soup.find('div', class_="details side_details").find('span', class_="label")
This is what I'm seeing on Metacritic:
Script for getting links:
import urllib.request
import csv
import os
import time
from bs4 import BeautifulSoup
from user_agent import generate_user_agent
filepath = 'gamelinks.csv'
if os.path.isfile(filepath):
    os.remove(filepath)
metacritic_base = "http://www.metacritic.com/browse/games/release-date/available/pc/metascore?view=detailed&page="
hdr = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'User-Agent': generate_user_agent(device_type="desktop", os=('mac', 'linux', 'win'))}
page_start = 0
page_end = 40
for i in range(page_start, page_end):
    print("Scraping Page {} - {} Pages Left".format(i, page_end - (i + 1)))
    links = []
    metacritic = metacritic_base + str(i)
    page = urllib.request.Request(metacritic, headers=hdr)
    content = urllib.request.urlopen(page).read()
    soup = BeautifulSoup(content, 'html.parser')
    right_class = soup.find_all('div', class_='browse_list_wrapper')
    for item in right_class:
        try:
            hrefs = item.find_all('a', class_="title", href=True)
            for it in hrefs:
                link = it['href']
                links.append(link)
        except Exception:
            pass
    with open(filepath, 'a') as output:
        writer = csv.writer(output, lineterminator='\n')
        for val in links:
            writer.writerow([val])
    time.sleep(1)
print("I'm done.")
Script for scraping:
import urllib.request
import csv
import os
from bs4 import BeautifulSoup
from user_agent import generate_user_agent
import json
import time
import xlsxwriter
import pprint
pp = pprint.PrettyPrinter(width=80, compact=True)
## Link File
filepath = 'gamelinks.csv'
if not os.path.isfile(filepath):
    print("Wrong file path!")
links = []
with open(filepath, 'r') as input_file:
    reader = csv.reader(input_file)
    for r in reader:
        links.append(r[0])
## File
## Xlsx
filepath = 'gameDataset.xlsx'
if os.path.isfile(filepath):
    os.remove(filepath)
workbook = xlsxwriter.Workbook('gameDataset.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, "Name")
worksheet.write(0, 1, "Release Date")
worksheet.write(0, 2, "Genre")
worksheet.write(0, 3, "Publisher")
worksheet.write(0, 4, "Meta Score")
worksheet.write(0, 5, "Total Criticism")
worksheet.write(0, 6, "User Rate")
worksheet.write(0, 7, "User Rate Count")
row = 1
## Xlsx
metacritic_base = "http://www.metacritic.com"
hdr = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'User-Agent': generate_user_agent(device_type="desktop", os=('mac', 'linux', 'win'))}
count = 1
exception_list = []
for link in links:
    print("Scraping Game {} - {} Games Left".format(count, len(links) - count))
    metacritic = metacritic_base + link
    try:
        page = urllib.request.Request(metacritic, headers=hdr)
        content = urllib.request.urlopen(page).read()
        soup = BeautifulSoup(content, 'html.parser')
        data = json.loads(soup.find('script', type='application/ld+json').text)
        cl_count = soup.find('div', class_="userscore_wrap").find('span', class_="count")
        user_rate_count = cl_count.find('a').text.replace(' Ratings', '')
        user_rating = soup.find('div', class_="user").text
        rating_count = data['aggregateRating']['ratingCount']
        rating_value = data['aggregateRating']['ratingValue']
        date = data['datePublished']
        genre_list = data['genre']
        name = data['name']
        publishers_list = []
        publishers = data['publisher']
        for pb in publishers:
            publishers_list.append(pb['name'])
        worksheet.write(row, 0, name)
        worksheet.write(row, 1, date)
        worksheet.write(row, 2, ", ".join(genre_list))
        worksheet.write(row, 3, ", ".join(publishers_list))
        worksheet.write(row, 4, rating_value)
        worksheet.write(row, 5, rating_count)
        worksheet.write(row, 6, user_rating)
        worksheet.write(row, 7, user_rate_count)
        row += 1
        #time.sleep(2)
    except Exception as e:
        exception_list.append("On game link {}, Error : {}".format(count, str(e)))
    count += 1
workbook.close()
if len(exception_list) > 0:
    filepath = "exceptions"
    if os.path.isfile(filepath):
        os.remove(filepath)
    with open(filepath, 'a') as output:
        writer = csv.writer(output, lineterminator='\n')
        for e in exception_list:
            writer.writerow([e])
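For orientation, the ld+json blob the script parses has roughly this shape - inferred purely from the keys read above, with placeholder values:

# Illustrative shape only - every value below is a placeholder.
data = {
    "name": "...",
    "datePublished": "...",
    "genre": ["...", "..."],
    "publisher": [{"name": "..."}, {"name": "..."}],
    "aggregateRating": {"ratingValue": "...", "ratingCount": "..."},
}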
CodePudding user response:
Note: Please focus new questions on a single issue and provide just a minimal reproducible example (MCVE).
How to select the developer?
You can go with find() or with CSS selectors - select the element with class developer and the <a> it contains:
soup.select_one('.developer a').text
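The find() equivalent of that selector - a sketch assuming, as the selector implies, that the developer class sits on the element wrapping the <a>:

# first element with class "developer", then the first <a> inside it
developer = soup.find(class_='developer').find('a').text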
Example
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
r = requests.get('https://www.metacritic.com/game/playstation-5/farming-simulator-22',headers=headers)
soup = BeautifulSoup(r.content, 'lxml')
soup.select_one('.developer a').text
Output
Giants Software
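Slotted into the xlsxwriter script from the question, that is one extra header cell plus one extra write per game (a sketch reusing the question's worksheet, row and soup variables):

worksheet.write(0, 8, "Developer")                 # once, next to the other header cells

# inside the per-game try block:
developer = soup.select_one('.developer a').text   # same selector as above
worksheet.write(row, 8, developer)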
EDIT
Focus - The question mainly deals with extracting the developer, but I will also provide a solution that creates your Excel file, this time with pandas.
Example
import urllib.request
from bs4 import BeautifulSoup
import json
import pandas as pd
metacritic_base = "http://www.metacritic.com"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
links = ['/game/playstation-5/farming-simulator-22','/game/playstation-5/grand-theft-auto-the-trilogy---the-definitive-edition']
data_list = []
exception_list = []
for count, link in enumerate(links):
    metacritic = metacritic_base + link
    print(metacritic)
    try:
        page = urllib.request.Request(metacritic, headers=headers)
        content = urllib.request.urlopen(page).read()
        soup = BeautifulSoup(content, 'html.parser')
        data = json.loads(soup.find('script', type='application/ld+json').text)
        data_list.append({
            'Name': data['name'],
            'Release Date': data['datePublished'],
            'Genre': ", ".join(data['genre']),
            'Publisher': ", ".join([x['name'] for x in data['publisher']]),
            'Developer': soup.select_one('.developer a').text,
            'Meta Score': data['aggregateRating']['ratingValue'],
            'Total Criticism': data['aggregateRating']['ratingCount'],
            'User Rates': soup.find('div', class_="user").text,
            'User Rating Count': soup.select_one('.userscore_wrap a').get_text(strip=True).replace(' Ratings', '')
        })
    except Exception as e:
        exception_list.append("On game link {}, Error : {}".format(count, str(e)))
# gives you a DataFrame; remove the comment marker before .to_excel(...) to also write the Excel file
pd.DataFrame(data_list)#.to_excel('gameDataset.xlsx', index=False)
Output excel file
Name | Release Date | Genre | Publisher | Developer | Meta Score | Total Criticism | User Rates | User Rating Count |
---|---|---|---|---|---|---|---|---|
Farming Simulator 22 | November 22, 2021 | Simulation, General | Giants Software, Solutions 2 GO | Giants Software | 78 | 4 | 6.8 | 6.8 |
Grand Theft Auto: The Trilogy - The Definitive Edition | November 11, 2021 | Miscellaneous, Compilation | Rockstar Games | Rockstar Games | 56 | 38 | 0.9 | 0.9 |
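To run the pandas version over the full link list instead of the two hard-coded paths, read gamelinks.csv back in first - a sketch assuming one URL path per row, as written by the first script:

import csv

# replaces the hard-coded links list above
with open('gamelinks.csv', 'r') as input_file:
    links = [r[0] for r in csv.reader(input_file)]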