I am scraping several websites with scrapy and my output creates a list of dicts (one per website). I would like my output to only create one dict. I've tried to use meta but I don't understand it well and I can't get it to work.
This is my code:
class TransferSpider(scrapy.Spider):
    """Spider that scrapes the transfer-day detail table on transfermarkt.es.

    Each crawled page yields a single dict whose values are parallel lists
    (one entry per player) — which is why the overall feed contains one dict
    per page rather than one dict total.
    """

    # Identifier scrapy uses to launch this crawler.
    name = 'transfers'
    # Pages of the transfer-day table to crawl (page 1 and page 2).
    start_urls = [
        'https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1',
        'https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/2',
    ]
    # Export every scraped item to players.json as indented UTF-8 JSON.
    custom_settings = {"FEEDS": {"players.json": {"format": "json", 'encoding': 'utf-8', 'indent': 4}}}

    def parse(self, response):
        """Yield one dict of parallel column lists for the crawled page."""
        # Collect every text node from the nested per-player tables.
        raw = response.xpath("//*[@id='yw1']/table/tbody//table//text()").extract()
        # Strip surrounding whitespace, then discard the now-empty strings.
        cells = [text for text in (fragment.strip() for fragment in raw) if text]
        # The table repeats six text cells per player, so slice with stride 6.
        yield {
            'names': cells[0::6],
            'position': cells[1::6],
            'origin_club': cells[2::6],
            'leage_origin_club': cells[3::6],
            'new_club': cells[4::6],
            'leage_new_club': cells[5::6],
        }
The solution probably isn't very difficult, but I can't figure it out.
The output I want is:
{
Names: [list with names],
Position:[list with positions]
...
}
CodePudding user response:
You did not specify the exact dict structure you want — and nobody can stop you from using complex solutions. However, the job can be done in a straightforward manner with plain Python, using requests, BeautifulSoup and pandas:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# One tuple per transfer row, accumulated across all result pages.
final_list = []
# Desktop-browser User-Agent: transfermarkt rejects the default requests UA.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17'}
for x in range(1, 7):
    r = requests.get(f'https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/2/page/{x}', headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    big_table = soup.select('table.items>tbody>tr')
    for row in big_table:
        # Query the direct <td> children ONCE per row; the original called
        # row.find_all('td', recursive=False) ten times per row, re-walking
        # the subtree for every field.
        cells = row.find_all('td', recursive=False)
        # Likewise hoist the nested player/club sub-tables that are read twice.
        player_tds = cells[0].select('td')
        origin_tds = cells[3].select('td')
        new_tds = cells[4].select('td')
        name = player_tds[1]
        position = player_tds[2]
        age = cells[1]
        nationality = cells[2].select_one('img')['alt']
        origin_club = origin_tds[1]
        origin_club_league = origin_tds[2]
        new_club = new_tds[1]
        new_club_league = new_tds[2]
        value_when_transferred = cells[5]
        cost = cells[6]
        final_list.append((name.text.strip(), age.text.strip(),
                           position.text.strip(), nationality,
                           origin_club.text.strip(), origin_club_league.text.strip(),
                           new_club.text.strip(), new_club_league.text.strip(),
                           value_when_transferred.text.strip(), cost.text.strip()))
final_df = pd.DataFrame(final_list, columns=['Name', 'Age', 'Position', 'Nationality',
                                             'Origin Club', 'Origin Club league', 'New Club', 'New Club League',
                                             'Value when transferred', 'Cost'])
final_df
This dataframe can be made into a dict:
final_dict = final_df.to_dict()
final_dict
CodePudding user response:
Based on your posted scrapy tag and the desired dictionary output, you can try the following example:
import scrapy
from scrapy.crawler import CrawlerProcess


class TransferSpider(scrapy.Spider):
    """Spider that yields one dict per transfer row from transfermarkt.es."""

    name = 'transfers'
    start_urls = [
        'https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/1',
        'https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/2',
    ]
    custom_settings = {
        # Be polite to the site: one request at a time with a 1s delay.
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        # Browser UA, since the site blocks scrapy's default agent string.
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    }
    # custom_settings={"FEEDS":{"players.json" : {"format" : "json", 'encoding':'utf-8', 'indent':4}}}

    def parse(self, response):
        """Yield a flat dict of player/club fields for each table row."""
        # FIX: the original selector '//*[@]/tbody/tr' is invalid XPath (its
        # attribute predicate was lost); restore the id-based path used by
        # this page's results table (the same '#yw1' container the question's
        # own XPath targets).
        for tr in response.xpath('//*[@id="yw1"]/table/tbody/tr'):
            yield {
                'names': tr.xpath('.//td[1]/table/tr[1]/td[2]/a/text()').get(),
                'position': tr.xpath('.//td[1]/table/tr[2]/td/text()').get(),
                'origin_club': tr.xpath('.//td[4]/table/tr/td[2]/a/text()').get(),
                'leage_origin_club': tr.xpath('.//td[4]/table/tr[2]/td/a/text()').get(),
                'new_club': tr.xpath('.//td[5]/table/tr[1]/td[2]/a/text()').get(),
                'leage_new_club': tr.xpath('.//td[5]/table/tr[2]/td/a/text()').get()
            }


if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(TransferSpider)
    process.start()
Output:
{'names': 'Neco Williams', 'position': 'Lateral derecho', 'origin_club': 'Liverpool', 'leage_origin_club': 'Premier League', 'new_club': 'Nottm Forest', 'leage_new_club': 'Premier League'}2022-07-16 18:02:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/1>
{'names': 'Omar Richards', 'position': 'Lateral izquierdo', 'origin_club': 'Bayern Múnich', 'leage_origin_club': 'Bundesliga', 'new_club': 'Nottm Forest', 'leage_new_club': 'Premier League'}
2022-07-16 18:02:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/1>
{'names': 'Hrvoje Babec', 'position': 'Pivote', 'origin_club': 'HNK Gorica', 'leage_origin_club': 'SuperSport HNL', 'new_club': 'Riga', 'leage_new_club': 'Virsliga'}
2022-07-16 18:02:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/1>
{'names': 'Henry Onyekuru', 'position': 'Extremo izquierdo', 'origin_club': 'Olympiakos', 'leage_origin_club': 'Super League 1', 'new_club': 'Adana Demirspor', 'leage_new_club': 'Süper Lig'}
2022-07-16 18:02:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/1>
{'names': 'Ricardo Grigore', 'position': 'Defensa central', 'origin_club': 'FC Dinamo', 'leage_origin_club': 'Liga 2', 'new_club': 'FC U Craiova', 'leage_new_club': 'SuperLiga'}
2022-07-16 18:02:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/1>
{'names': 'Max Kremer', 'position': 'Extremo izquierdo', 'origin_club': 'Energie Cottbus', 'leage_origin_club': 'Regionalliga Nordost', 'new_club': 'SF Lotte', 'leage_new_club': 'Oberliga
Westfalen'}
... so on