I am scraping several websites with scrapy and my output creates a list of dicts (one per website). I would like my output to only create one dict. I've tried to use meta but I don't understand it well and I can't get it to work.
This is my code:
class TransferSpider(scrapy.Spider):
    """Spider that scrapes the transfer-day detail table on transfermarkt.es.

    Each crawled page yields a single dict whose values are parallel lists
    (one entry per player) — which is why the overall feed contains one dict
    per page rather than one dict total.
    """

    # Identifier scrapy uses to launch this crawler.
    name = 'transfers'
    # Pages of the transfer-day table to crawl (page 1 and page 2).
    start_urls = [
        'https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1',
        'https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/2',
    ]
    # Export every scraped item to players.json as indented UTF-8 JSON.
    custom_settings = {"FEEDS": {"players.json": {"format": "json", 'encoding': 'utf-8', 'indent': 4}}}

    def parse(self, response):
        """Yield one dict of parallel column lists for the crawled page."""
        # Collect every text node from the nested per-player tables.
        raw = response.xpath("//*[@id='yw1']/table/tbody//table//text()").extract()
        # Strip surrounding whitespace, then discard the now-empty strings.
        cells = [text for text in (fragment.strip() for fragment in raw) if text]
        # The table repeats six text cells per player, so slice with stride 6.
        yield {
            'names': cells[0::6],
            'position': cells[1::6],
            'origin_club': cells[2::6],
            'leage_origin_club': cells[3::6],
            'new_club': cells[4::6],
            'leage_new_club': cells[5::6],
        }
The solution probably isn't very difficult, but I can't figure it out.
The output I want is:
{
Names: [list with names],
Position:[list with positions]
...
}
CodePudding user response:
You did not specify the exact dict structure you want — and nobody can stop you from using complex solutions. However, the job can be done in a straightforward manner with plain Python, using requests, BeautifulSoup and pandas:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# One tuple per transfer row, accumulated across all result pages.
final_list = []
# Desktop-browser User-Agent: transfermarkt rejects the default requests UA.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17'}
for x in range(1, 7):
    r = requests.get(f'https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/2/page/{x}', headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    big_table = soup.select('table.items>tbody>tr')
    for row in big_table:
        # Query the direct <td> children ONCE per row; the original called
        # row.find_all('td', recursive=False) ten times per row, re-walking
        # the subtree for every field.
        cells = row.find_all('td', recursive=False)
        # Likewise hoist the nested player/club sub-tables that are read twice.
        player_tds = cells[0].select('td')
        origin_tds = cells[3].select('td')
        new_tds = cells[4].select('td')
        name = player_tds[1]
        position = player_tds[2]
        age = cells[1]
        nationality = cells[2].select_one('img')['alt']
        origin_club = origin_tds[1]
        origin_club_league = origin_tds[2]
        new_club = new_tds[1]
        new_club_league = new_tds[2]
        value_when_transferred = cells[5]
        cost = cells[6]
        final_list.append((name.text.strip(), age.text.strip(),
                           position.text.strip(), nationality,
                           origin_club.text.strip(), origin_club_league.text.strip(),
                           new_club.text.strip(), new_club_league.text.strip(),
                           value_when_transferred.text.strip(), cost.text.strip()))
final_df = pd.DataFrame(final_list, columns=['Name', 'Age', 'Position', 'Nationality',
                                             'Origin Club', 'Origin Club league', 'New Club', 'New Club League',
                                             'Value when transferred', 'Cost'])
final_df
This dataframe can be made into a dict:
final_dict = final_df.to_dict()
final_dict
CodePudding user response:
Based on your posted scrapy tag and the desired dictionary output, you can try the following example:
import scrapy
from scrapy.crawler import CrawlerProcess


class TransferSpider(scrapy.Spider):
    """Spider that yields one dict per transfer row from transfermarkt.es."""

    name = 'transfers'
    start_urls = [
        'https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/1',
        'https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/2',
    ]
    custom_settings = {
        # Be polite to the site: one request at a time with a 1s delay.
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        # Browser UA, since the site blocks scrapy's default agent string.
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    }
    # custom_settings={"FEEDS":{"players.json" : {"format" : "json", 'encoding':'utf-8', 'indent':4}}}

    def parse(self, response):
        """Yield a flat dict of player/club fields for each table row."""
        # FIX: the original selector '//*[@]/tbody/tr' is invalid XPath (its
        # attribute predicate was lost); restore the id-based path used by
        # this page's results table (the same '#yw1' container the question's
        # own XPath targets).
        for tr in response.xpath('//*[@id="yw1"]/table/tbody/tr'):
            yield {
                'names': tr.xpath('.//td[1]/table/tr[1]/td[2]/a/text()').get(),
                'position': tr.xpath('.//td[1]/table/tr[2]/td/text()').get(),
                'origin_club': tr.xpath('.//td[4]/table/tr/td[2]/a/text()').get(),
                'leage_origin_club': tr.xpath('.//td[4]/table/tr[2]/td/a/text()').get(),
                'new_club': tr.xpath('.//td[5]/table/tr[1]/td[2]/a/text()').get(),
                'leage_new_club': tr.xpath('.//td[5]/table/tr[2]/td/a/text()').get()
            }


if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(TransferSpider)
    process.start()
Output:
{'names': 'Neco Williams', 'position': 'Lateral derecho', 'origin_club': 'Liverpool', 'leage_origin_club': 'Premier League', 'new_club': 'Nottm Forest', 'leage_new_club': 'Premier League'}2022-07-16 18:02:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/1>
{'names': 'Omar Richards', 'position': 'Lateral izquierdo', 'origin_club': 'Bayern Múnich', 'leage_origin_club': 'Bundesliga', 'new_club': 'Nottm Forest', 'leage_new_club': 'Premier League'}
2022-07-16 18:02:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/1>
{'names': 'Hrvoje Babec', 'position': 'Pivote', 'origin_club': 'HNK Gorica', 'leage_origin_club': 'SuperSport HNL', 'new_club': 'Riga', 'leage_new_club': 'Virsliga'}
2022-07-16 18:02:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/1>
{'names': 'Henry Onyekuru', 'position': 'Extremo izquierdo', 'origin_club': 'Olympiakos', 'leage_origin_club': 'Super League 1', 'new_club': 'Adana Demirspor', 'leage_new_club': 'Süper Lig'}
2022-07-16 18:02:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/1>
{'names': 'Ricardo Grigore', 'position': 'Defensa central', 'origin_club': 'FC Dinamo', 'leage_origin_club': 'Liga 2', 'new_club': 'FC U Craiova', 'leage_new_club': 'SuperLiga'}
2022-07-16 18:02:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/2022-07-10/sort//plus/1/page/1>
{'names': 'Max Kremer', 'position': 'Extremo izquierdo', 'origin_club': 'Energie Cottbus', 'leage_origin_club': 'Regionalliga Nordost', 'new_club': 'SF Lotte', 'leage_new_club': 'Oberliga
Westfalen'}
... so on