I was doing some web scraping when I ran into the following problem.
These are the nested dictionaries produced from the links I scraped:
d1 = {'Gaia Project': {'Jugadores': '1 a 4', 'Duración': '60 – 150 minutos', 'Edad': '12 ', 'Dureza': '4.37', 'Precio': '59,46€', 'Género': 'Eurogame – Mayorías', 'Editorial': 'Maldito Games', 'Diseñador/a': 'Jens Drögemüller', 'Total': '8.5', 'Aspecto / Componentes': '8', 'Diversión': '8', 'Variabilidad': '9.5', 'Originalidad': '9', 'Mecánicas': '8.5', 'Nota de lectores10 Votos': '8.5'}}
d2 = {'Churchill': {'Jugadores': '1 a 3', 'Duración': '60 – 300 minutos', 'Edad': '14 ', 'Dureza': '3.28', 'Precio': '71,96€', 'Género': 'Eurogame – Construcción de Rutas, Económico.', 'Editorial': 'GMT Games\xa0/\xa0Devir', 'Diseñador/a': 'Mark Herman', 'Total': '8.9', 'Aspecto / Componentes': '8.1', 'Interacción': '9.7', 'Variabilidad': '8', 'Originalidad': '8.7', 'Mecánicas': '9.2'}}
As you can see in d1, the last category mentions:
'Nota de lectores10 Votos': '8.5'
I would like to split it into two separate key/value pairs, so that the dict looks like this (see the end):
{'Gaia Project': {'Jugadores': '1 a 4', 'Duración': '60 – 150 minutos', 'Edad': '12 ', 'Dureza': '4.37', 'Precio': '59,46€', 'Género': 'Eurogame – Mayorías', 'Editorial': 'Maldito Games', 'Diseñador/a': 'Jens Drögemüller', 'Total': '8.5', 'Aspecto / Componentes': '8', 'Diversión': '8', 'Variabilidad': '9.5', 'Originalidad': '9', 'Mecánicas': '8.5', 'Nota de lectores': '8.5', 'N. Votes': '10 Votos'}}
This is what I tried:
pattern_votes= r' de lectores\d.*'
if key.startswith('Nota'):
lectores = category.split(pattern_votes)
category.append(lectores[0],"N. Votes")
value.append(lectores[1])
Where the new category would be 'N. Votes' and its value '10 Votos'.
I also tried an if(filter(pattern_votes,d1))
but apparently nothing happened.
These are the lists from category and value respectively:
category = ['Jugadores', 'Duración', 'Edad', 'Dureza', 'Precio', 'Género', 'Editorial', 'Diseñador/a', 'Total', 'Aspecto / Componentes', 'Diversión', 'Variabilidad', 'Originalidad', 'Mecánicas', 'Nota de lectores10 Votos']
value = ['1 a 4', '60 – 150 minutos', '12 ', '4.37', '59,46€', 'Eurogame – Mayorías', 'Maldito Games', 'Jens Drögemüller', '8.5', '8', '8', '9.5', '9', '8.5', '8.5']
Thank you for any help!
EDIT: As Kuldeep suggested, here is my code.
The string literal at the end is what I tried, but it didn't work.
import requests
import re
from bs4 import BeautifulSoup
import os
from collections import defaultdict
link = "https://mishigeek.com/gaia-project-resena-en-solitario/"
link2 = "https://mishigeek.com/churchill-resena-en-espanol-es-un-wargame/"
#def get_ratings(review):
# Set up the headers for the HTTP request.
# NOTE(review): the indentation of this snippet was lost when it was pasted, so
# the nesting of the statements below has to be inferred by the reader.
def get_info(link):
# Start from requests' default headers and override the User-Agent with a
# desktop Chrome string.
headers = requests.utils.default_headers()
headers.update(
{
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36',
}
)
# Connect to the URL with .get()
sitemap_soup = requests.get(link, headers=headers)
sitemap_soup.close()
# Only parse the page when the response reports success.
if (sitemap_soup.ok==True):
soup = BeautifulSoup(sitemap_soup.text,features="html.parser")
# d maps each game name to its {category: value} dict.
d= defaultdict(dict)
key=[]
category=[]
value=[]
otros=[] # Other set of category and values that will have to split.
# Strips everything from "-resena" to the end of the URL slug.
pattern = r'-resena.*$'
# Only referenced by the quoted attempt in the string literal at the end.
pattern_votes= r' de lectores\d.*'
# The for loops below collect every element matched by each soup.select call.
# NOTE(review): the next two loops bind the same name (each_part); whether the
# second is nested inside the first cannot be told from this paste.
for each_part in soup.select('figure[class*="wp-block-table"]'):
for each_part in soup.select('tr'):
otros.append(each_part.get_text())
# Split the first 8 collected rows on ':' and transpose them into separate
# category and value lists. A row without ':' would leave a single tuple and
# break the two-name unpacking.
split_items = (i.split(':') for i in otros[:8])
category, value = zip(*split_items)
category, value = map(list, (category, value))
# Game name: last path component of the link (after dropping the link's final
# character), minus the "-resena..." suffix, hyphens turned into spaces,
# title-cased.
nombre = re.sub(pattern,'',os.path.basename(link[:-1])).replace('-', ' ').title()
key.append(nombre)
# Label for the final score; its text is appended to value by the next loop.
category.append("Total")
for each_part in soup.select('div[class*="lets-review-block lets-review-block__final-score"]'):
value.append(each_part.get_text())
# Review criteria titles go to category, their scores to value.
for each_part in soup.select('div[class*="lets-review-block__crit__title lr-font-h"]'):
category.append(each_part.get_text())
for each_part in soup.select('div[class*="lets-review-block__crit__score"]'):
value.append(each_part.get_text())
# Pair categories with values positionally and store them under the game name.
for k in key:
for c,v in zip(category,value):
d[k][c]=v
print(d)
print(category)
print(value)
# The string literal below holds the attempt that did not work; it is not
# executed as code.
'''
if key.startswith('Nota'):
lectores = category.split(pattern_votes)
category.append(lectores[0],"N. Votos")
value.append(lectores[1])
'''
CodePudding user response:
Let's start from the smallest problem: How to split 'Nota de lectores10 Votos' into 'Nota de lectores' and '10 Votos'. My approach is to use the itertools
library: Use takewhile
to get the part before the first digit, and dropwhile
for the part from the first digit on.
import itertools
def split_before_number(text):
    """Cut ``text`` in two at its first digit.

    Returns a ``(before, after)`` tuple. ``before`` holds every character
    that precedes the first digit; ``after`` starts at that digit and runs
    to the end of the text. If ``text`` contains no digit, ``after`` is the
    empty string.
    """
    # takewhile keeps the leading non-digit run; dropwhile skips that same
    # run and keeps everything from the first digit onwards.
    leading = itertools.takewhile(lambda char: not char.isdigit(), text)
    trailing = itertools.dropwhile(lambda char: not char.isdigit(), text)
    return "".join(leading), "".join(trailing)
Test it:
>>> split_before_number('Nota de lectores10 Votos')
('Nota de lectores', '10 Votos')
Next, I would like to address the problem of transforming a pair of key/value into 1 or 2 pairs:
# This pair 'Jugadores': '1 a 4'
# Becomes: 'Jugadores': '1 a 4'
# This pair: 'Nota de lectores10 Votos': '8.5'
# Becomes: 'Nota de lectores': '8.5'
# and 'N. Votes': '10 Votos'
The code for that:
def split_key_and_value(key, value):
    """Yield the key/value pair(s) that one scraped pair should become.

    Keys that do not start with "Nota" are passed through unchanged. A key
    such as 'Nota de lectores10 Votos' is split at its first digit: the
    text before the digit keeps the original value, and the text from the
    digit onwards is emitted as the value of an extra "N. Votes" entry.

    Yields:
        ``(key, value)`` tuples -- one for ordinary keys, two for a "Nota"
        key that carries a vote count.
    """
    if not key.startswith("Nota"):
        yield key, value
        return

    label, votes = split_before_number(key)
    yield label, value
    # A "Nota" key with no digits has no vote count; skip the second pair
    # instead of emitting an empty 'N. Votes' entry.
    if votes:
        yield "N. Votes", votes
Test it:
>>> dict(split_key_and_value('Nota de lectores10 Votos', "8.5"))
{'Nota de lectores': '8.5', 'N. Votes': '10 Votos'}
>>> dict(split_key_and_value("Jugadores", "1 a 4"))
{'Jugadores': '1 a 4'}
With those functions, we can now work on the bigger problem: transforming the keys and values of d1
's value, which I call v1
:
def transform(dict_object):
    """Return a new dict with the special keys of ``dict_object`` split up.

    Every key/value pair is run through ``split_key_and_value``; whatever
    pairs it yields (one or two) are stored in the result. The input dict
    is left untouched.
    """
    result = {}
    for pair in dict_object.items():
        # dict.update accepts an iterable of (key, value) tuples, which is
        # exactly what the generator produces.
        result.update(split_key_and_value(*pair))
    return result
Test it:
>>> d1 = {'Gaia Project': {'Jugadores': '1 a 4',
'Duración': '60 – 150 minutos',
'Edad': '12 ',
'Dureza': '4.37',
'Precio': '59,46€',
'Género': 'Eurogame – Mayorías',
'Editorial': 'Maldito Games',
'Diseñador/a': 'Jens Drögemüller',
'Total': '8.5',
'Aspecto / Componentes': '8',
'Diversión': '8',
'Variabilidad': '9.5',
'Originalidad': '9',
'Mecánicas': '8.5',
'Nota de lectores10 Votos': '8.5'}}
>>> v1 = d1["Gaia Project"]
>>> transform(v1)
{'Jugadores': '1 a 4',
'Duración': '60 – 150 minutos',
'Edad': '12 ',
'Dureza': '4.37',
'Precio': '59,46€',
'Género': 'Eurogame – Mayorías',
'Editorial': 'Maldito Games',
'Diseñador/a': 'Jens Drögemüller',
'Total': '8.5',
'Aspecto / Componentes': '8',
'Diversión': '8',
'Variabilidad': '9.5',
'Originalidad': '9',
'Mecánicas': '8.5',
'Nota de lectores': '8.5',
'N. Votes': '10 Votos'}
Now that we can transform d1
value, we can apply that transformation on d1:
>>> d1 = {key: transform(value) for key, value in d1.items()}
>>> d1
{'Gaia Project': {'Jugadores': '1 a 4',
'Duración': '60 – 150 minutos',
'Edad': '12 ',
'Dureza': '4.37',
'Precio': '59,46€',
'Género': 'Eurogame – Mayorías',
'Editorial': 'Maldito Games',
'Diseñador/a': 'Jens Drögemüller',
'Total': '8.5',
'Aspecto / Componentes': '8',
'Diversión': '8',
'Variabilidad': '9.5',
'Originalidad': '9',
'Mecánicas': '8.5',
'Nota de lectores': '8.5',
'N. Votes': '10 Votos'}}
CodePudding user response:
What are our steps:
- Iterate through all movies and their properties
- Find a property with specific name
- Extract number of votes
- Update properties
Let's implement it:
import re

# Matches the full key and captures the number of votes. The quantifier must
# be "\d+" (one or more digits): "\d " would demand a digit followed by a
# space and never match keys such as 'Nota de lectores10 Votos'.
pattern = r"Nota de lectores(\d+).*"

for properties in movies.values():  # 1
    m = None
    old_value = None
    for k, v in properties.items():
        if m := re.match(pattern, k):  # 2, this syntax assumes python 3.8
            old_value = v  # remember the value stored under the matched key
            break
    if m is not None:
        # 4 -- done after the inner loop so the dict is not resized while
        # it is being iterated.
        del properties[m.group(0)]  # remove old key
        properties["Nota de lectores"] = old_value  # store previous value
        properties["Votes"] = m.group(1)  # 3
Note that we cannot update properties while looping over it, because a dict's size must not change during iteration.