I have a Python script based on BeautifulSoup and Playwright that scrapes data from Google Maps. The page is supposed to deliver a list of results; however, when there is only a single result, the page layout is different. How can I detect this in Python so I can run different instructions for that case?
My current script is:
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import time
import csv
from csv import writer
import random
import os

count = 0
print(os.getcwd())

with open('exported_data.csv', 'a', newline='', encoding='utf-8') as f_header:
    csv_header = writer(f_header)
    csv_header.writerow(['Company Name', 'Rating', 'Term Searched'])

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    with open('terms_to_search.csv') as csv_file:
        csv_reader = csv.reader(csv_file)
        for row in csv_reader:
            try:
                page = browser.new_page()
                term_searched = row[0]
                name = []
                rating = []
                page.goto('https://www.google.com/maps/')
                page.fill('xpath=//*[@id="searchboxinput"]', f'{row[0]}')
                time.sleep(3)
                page.click('xpath=//*[@id="searchbox-searchbutton"]', timeout=0)
                time.sleep(3)
                page.mouse.move(0, 100)
                while not page.is_visible('span.HlvSq'):
                    count = count + 1
                    if count < 50:
                        page.mouse.wheel(0, 500)
                        time.sleep(random.randint(1, 2))
                        page.mouse.wheel(0, 500)
                        time.sleep(random.randint(1, 2))
                        page.mouse.wheel(0, 500)
                        time.sleep(random.randint(1, 2))
                        page.mouse.wheel(0, 500)
                        time.sleep(random.randint(1, 2))
                        page.mouse.wheel(0, 500)
                        time.sleep(random.randint(1, 2))
                        page.mouse.wheel(0, 500)
                        time.sleep(random.randint(1, 2))
                    else:
                        page.click('xpath=//*[@id="searchbox-searchbutton"]', timeout=60)
                        count = 0
                        time.sleep(5)
                        page.mouse.move(0, 100)
                count = 0
                time.sleep(10)
                html = page.inner_html('html')
                soup = BeautifulSoup(html, 'html.parser')
                try:
                    results = soup.find('div', class_='m6QErb DxyBCb kA9KIf dS8AEf ecceSd').find_all('div', class_='lI9IFe')
                except:
                    pass
                for result in results:
                    try:
                        name = result.find('div', class_='qBF1Pd fontHeadlineSmall').text.strip()
                    except:
                        pass
                    try:
                        rating = result.find('span', class_='MW4etd').text.strip()
                    except:
                        pass
                    data = [name, rating, term_searched]
                    with open('exported_data.csv', 'a', encoding='utf-8', newline='') as f:
                        csv_writer = writer(f)
                        csv_writer.writerow(data)
                website = ''
                count += 1
                page.close()
            except:
                pass
Search terms (in the CSV):
"Porto Alegre, RS Teather"
"Itaqui, RS Teather"
(the problem happens in the second search)
My idea is to have something like:
if page results different from the previous one:
    # do this
Would appreciate any help. Thank you!
CodePudding user response:
While you are looking for a list of search results, a single-result page contains zero such list elements.
So, using that logic, all you have to do is check whether the div element that holds the list of search results exists or not.
It can be done like this in your code:
html = page.inner_html('html')
soup = BeautifulSoup(html, 'html.parser')

results = soup.find('div', attrs={'class': 'm6QErb DxyBCb kA9KIf dS8AEf ecceSd', 'aria-label': True})
if results is None:
    # There is no list of search results, so you can exit the code right here
    ...
results = results.find_all('div', class_='lI9IFe')
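If you would rather capture the single result instead of skipping it, you can branch on that same check and parse the detail panel. Here is a minimal sketch that slots into your existing loop (it reuses soup and term_searched from your code); the two detail-panel class names are placeholders I have not verified, so inspect the single-result page and fill in the real ones:

# Branch between the results-list page and the single-result page.
# PLACEHOLDER_NAME_CLASS / PLACEHOLDER_RATING_CLASS are hypothetical --
# replace them with the actual class names from the single-result page.
results_container = soup.find('div', attrs={'class': 'm6QErb DxyBCb kA9KIf dS8AEf ecceSd', 'aria-label': True})
if results_container is None:
    # Single-result page: scrape the detail panel instead of a list
    name_tag = soup.find('h1', class_='PLACEHOLDER_NAME_CLASS')
    rating_tag = soup.find('span', class_='PLACEHOLDER_RATING_CLASS')
    name = name_tag.get_text(strip=True) if name_tag else ''
    rating = rating_tag.get_text(strip=True) if rating_tag else ''
    rows = [[name, rating, term_searched]]
else:
    # List page: collect every result card
    rows = []
    for result in results_container.find_all('div', class_='lI9IFe'):
        name = result.find('div', class_='qBF1Pd fontHeadlineSmall').get_text(strip=True)
        rating = result.find('span', class_='MW4etd').get_text(strip=True)
        rows.append([name, rating, term_searched])

Either way you end up with a uniform rows list that you can write to the CSV with the same writer code.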
EDIT: The above code only works while you are parsing the page with BeautifulSoup; it will not work in the Playwright driver itself (my bad).
So the logic stays the same: search for the parent div whose children are the search results arranged in a list. If it doesn't exist, stop the code (or just skip the current iteration and move on to the next search).
And for that, all I've done is add one more check to the while loop:
while page.query_selector('div.m6QErb.DxyBCb.kA9KIf.dS8AEf.ecceSd[aria-label]') and not page.is_visible('span.HlvSq'):
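As a side note, instead of polling is_visible inside the scroll loop you could let Playwright wait for the list container itself and treat a timeout as "this is a single-result page". A rough sketch, assuming the same container selector and an arbitrary 10-second timeout (you would still need the scrolling afterwards to load the full list):

from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

LIST_CONTAINER = 'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.ecceSd[aria-label]'

try:
    # Wait up to 10 s for the results list; a single-result page never renders it.
    page.wait_for_selector(LIST_CONTAINER, timeout=10_000)
    has_result_list = True
except PlaywrightTimeoutError:
    has_result_list = False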
Therefore, the final code (I refactored some parts of it myself):
import csv
import os
import sys
import time
from csv import writer
from random import randint

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

count = 0
print(os.getcwd())

with open('exported_data.csv', 'a', newline='', encoding='utf-8') as f_header:
    csv_header = writer(f_header)
    csv_header.writerow(['Company Name', 'Rating', 'Term Searched'])

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    try:
        with open('terms_to_search.csv') as csv_file:
            csv_reader = csv.reader(csv_file)
            for term_searched, *_ in csv_reader:
                page = browser.new_page()
                page.goto('https://www.google.com/maps/')
                page.fill('xpath=//*[@id="searchboxinput"]', term_searched)
                time.sleep(3)
                page.click('xpath=//*[@id="searchbox-searchbutton"]', timeout=0)
                time.sleep(3)
                page.mouse.move(0, 100)

                count = 0
                # Scroll only while the results-list container exists and its
                # end-of-list marker ('span.HlvSq') is not yet visible. On a
                # single-result page the container is missing, so the loop is skipped.
                while page.query_selector('div.m6QErb.DxyBCb.kA9KIf.dS8AEf.ecceSd[aria-label]') and not page.is_visible('span.HlvSq'):
                    count += 1
                    if count < 50:
                        for _ in range(6):
                            page.mouse.wheel(0, 500)
                            time.sleep(randint(1, 2))
                    else:
                        page.click('xpath=//*[@id="searchbox-searchbutton"]', timeout=60)
                        time.sleep(5)
                        page.mouse.move(0, 100)

                time.sleep(10)
                html = page.inner_html('body')
                soup = BeautifulSoup(html, 'html.parser')

                results = soup.find('div', attrs={'class': 'm6QErb DxyBCb kA9KIf dS8AEf ecceSd', 'aria-label': True})
                if results is None:
                    # Single-result page: no list of results, skip this search term
                    page.close()
                    continue
                    # ------ or stop the whole script ---------
                    # sys.exit()

                results = results.find_all('div', class_='lI9IFe')
                for result in results:
                    name = result.find('div', class_='qBF1Pd fontHeadlineSmall').get_text(strip=True)
                    rating = result.find('span', class_='MW4etd').get_text(strip=True)
                    data = [name, rating, term_searched]
                    with open('exported_data.csv', 'a', encoding='utf-8', newline='') as f:
                        csv_writer = writer(f)
                        csv_writer.writerow(data)

                count += 1
                page.close()
    finally:
        browser.close()
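One small follow-up on the refactor: the inner loop reopens exported_data.csv for every row it writes. If you scrape a lot of results, you can open the file once per search term and reuse the writer, roughly like this:

# Open the output file once per term instead of once per row (same columns as above)
with open('exported_data.csv', 'a', encoding='utf-8', newline='') as f:
    csv_writer = writer(f)
    for result in results:
        name = result.find('div', class_='qBF1Pd fontHeadlineSmall').get_text(strip=True)
        rating = result.find('span', class_='MW4etd').get_text(strip=True)
        csv_writer.writerow([name, rating, term_searched])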
Tell me if it's not working...