out of curiosity I started goofing around with selenium and BeautifulSoup. The Code below is working quit fine for me. I only want to know if there is a better//shorter way to save the data to the dict instead of using the two seperated lists and for loops.
The code is scraping amazon for a book via the ISBN-13 number and than get some information about the book.
THIS PART OF THE CODE
<----------------------------------------------------------------------------------->
def get_item_data(user_input):
item_url = get_item_url(user_input)
driver.get(item_url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
# scrape html page for needed item data
results_key = soup.find_all('div',{'class':'a-section a-spacing-small a-text-center rpi-attribute-label'})
results_value = soup.find_all('div',{'class':'a-section a-spacing-none a-text-center rpi-attribute-value'})
#loop over results and extract specific data
#save extracted data to list
key_list = []
value_list = []
for data in results_key:
key = data.span.text
key_list.append(key)
for data in results_value:
value = data.span.text
value_list.append(value)
# combine lists into dict
data_dict = dict(zip(key_list,value_list)
<-------------------------------------------------------------------------->
OUTPUT
[WDM] - ====== WebDriver manager ======
[WDM] - Current google-chrome version is 94.0.4606
[WDM] - Get LATEST driver version for 94.0.4606
[WDM] - Driver [C:\Users\Ignaz\.wdm\drivers\chromedriver\win32\94.0.4606.61\chromedriver.exe] found in cache
DevTools listening on ws://127.0.0.1:50547/devtools/browser/598bf196-9909-4a5d-a47e-94e5d28f96f4
[17784:18480:1013/134257.078:ERROR:display_layout.cc(559)] PlacementList must be sorted by first 8 bits of display_id
<<------------------------------------------>>
[<div class="a-section a-spacing-small a-text-center rpi-attribute-label">
<span>Seitenzahl der Print-Ausgabe</span>
</div>, <div class="a-section a-spacing-small a-text-center rpi-attribute-label">
<span>Sprache</span>
</div>, <div class="a-section a-spacing-small a-text-center rpi-attribute-label">
<span>Herausgeber</span>
</div>, <div class="a-section a-spacing-small a-text-center rpi-attribute-label">
<span>Erscheinungstermin</span>
</div>, <div class="a-section a-spacing-small a-text-center rpi-attribute-label">
<span>Abmessungen</span>
</div>, <div class="a-section a-spacing-small a-text-center rpi-attribute-label">
<span>ISBN-10</span>
</div>, <div class="a-section a-spacing-small a-text-center rpi-attribute-label">
<span>ISBN-13</span>
<<------------------------------------------>>
</div>] [<div class="a-section a-spacing-none a-text-center rpi-attribute-value">
<span>256 Seiten</span>
</div>, <div class="a-section a-spacing-none a-text-center rpi-attribute-value">
<span>Englisch</span>
</div>, <div class="a-section a-spacing-none a-text-center rpi-attribute-value">
<span>Profile Books</span>
</div>, <div class="a-section a-spacing-none a-text-center rpi-attribute-value">
<span>3. August 2017</span>
</div>, <div class="a-section a-spacing-none a-text-center rpi-attribute-value">
<span>12.8 x 2 x 19.4 cm</span>
</div>, <div class="a-section a-spacing-none a-text-center rpi-attribute-value">
<span>1781257027</span>
</div>, <div class="a-section a-spacing-none a-text-center rpi-attribute-value">
<span>978-1781257029</span>
</div>]
#results_key list
<<------------------------------->>
['Seitenzahl der Print-Ausgabe', 'Sprache', 'Herausgeber', 'Erscheinungstermin', 'Abmessungen', 'ISBN-10', 'ISBN-13']
<<------------------------------->>
#results_value list
['256 Seiten', 'Englisch', 'Profile Books', '3. August 2017', '12.8 x 2 x 19.4 cm', '1781257027', '978-1781257029']
<<------------------------------->>
# zipped-dict
{'Seitenzahl der Print-Ausgabe': '256 Seiten', 'Sprache': 'Englisch', 'Herausgeber': 'Profile Books', 'Erscheinungstermin': '3. August 2017', 'Abmessungen': '12.8 x 2 x 19.4 cm', 'ISBN-10': '1781257027', 'ISBN-13': '978-1781257029'}
<<------------------------------->>
WHOLE CODE
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())
# function to compute right url out of user input
def get_item_url(search_term):
template = 'https://www.amazon.de/s?k={}&__mk_de_DE=ÅMÅŽÕÑ&ref=nb_sb_noss'
search_term = search_term.replace("-","")
url = template.format(search_term)
# open driver and get item href
driver.get(url)
soup = BeautifulSoup(driver.page_source,'html.parser')
# scrape page for specific href of item
results = soup.find_all('h2',{'class','a-size-mini a-spacing-none a-color-base s-line-clamp-2'})
item = results[0]
item_href = item.a.get('href')
item_url = 'https://www.amazon.de' item_href
return item_url
# function to get item data
def get_item_data(user_input):
item_url = get_item_url(user_input)
driver.get(item_url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
# scrape html page for needed item data
results_key = soup.find_all('div',{'class':'a-section a-spacing-small a-text-center rpi-attribute-label'})
results_value = soup.find_all('div',{'class':'a-section a-spacing-none a-text-center rpi-attribute-value'})
print('<<------------------------------------------>>')
print(results_key, results_value)
print('<<------------------------------------------>>')
#loop over results and extract specific data
#save extracted data to list
key_list = []
value_list = []
for data in results_key:
key = data.span.text
key_list.append(key)
for data in results_value:
value = data.span.text
value_list.append(value)
# combine lists into dict
data_dict = dict(zip(key_list,value_list))
driver.close()
print(key_list)
print('<<------------------------------->>')
print(value_list)
print('<<------------------------------->>')
print(data_dict)
print('<<------------------------------->>')
return data_dict
data_list = get_item_data('978-3-86882-504-6')
CodePudding user response:
You can use dictionary comprehension and get a dictionary on single line of code.
data_dict = { key.span.text: val.span.text for (key, val) in zip(results_key, results_value) }