I am trying to scrape Google Hot Trends. I ran Chrome developer tools to capture all requests, but there seem to be no requests going in or out. So I tried to use Selenium, but I could not get the data, mainly because the data is variable and changes constantly. Here is the code I tried:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup
options = options.Options()
options.headless = True
options.add_argument("--headless")
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    """Load *target* in headless Chrome and print the tile elements found.

    Loops forever. Each pass reloads the page, parses the rendered HTML with
    BeautifulSoup and, when a ``tile`` tag is found, prints the list of
    children of the first ``div`` whose class is ``tile``.

    Args:
        target: URL of the page to load.
    """
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        # The page is fetched again on every iteration.
        driver.get(target)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("tile"):
            titles = [title for title in soup.find("div", class_="tile")]
            if len(titles) > 0:
                print(titles)


HeadlessBrowserHttpRequest(url)
CodePudding user response:
Your code looks correct.
The only thing I see missing is that you have to extract the text from the web elements you get here.
Also, I'd prefer printing the texts one by one rather than the whole array at once.
As follows:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup
options = options.Options()
options.headless = True
options.add_argument("--headless")
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    """Load *target* in headless Chrome and print the text of each tile child.

    Loops forever. Each pass reloads the page, parses the rendered HTML with
    BeautifulSoup and prints the text of every child of the first ``div``
    whose class is ``tile``, one per line.

    Args:
        target: URL of the page to load.
    """
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        driver.get(target)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # Look the container up once, by class. Passing "tile" as the first
        # positional argument would search for a <tile> *tag* instead, which
        # does not match a <div class="tile"> element.
        tile = soup.find("div", class_="tile")
        if tile is not None:
            for title in tile:
                print(title.text)


HeadlessBrowserHttpRequest(url)
CodePudding user response:
I managed to solve the problem with the following code:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup
# Named distinctly so the imported ``options`` module is not shadowed.
chrome_options = options.Options()
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str, *, poll_interval: float = 1.0) -> None:
    """Open *target* in Chrome and print each card once it has finished typing.

    The page is loaded once; its rendered HTML is then re-parsed every
    ``poll_interval`` seconds. Whenever the first ``div`` with class
    ``card current done-typing`` is present and its content differs from the
    last content printed, the text of each of its children is printed on its
    own line.

    The loop has no exit condition: it runs until the process is interrupted
    or an exception is raised, at which point the browser is shut down.

    Args:
        target: URL of the page to load.
        poll_interval: Seconds to wait between two inspections of the page.
    """
    import time  # local import keeps this snippet self-contained

    # NOTE(review): ``executable_path`` is the older Selenium calling
    # convention; newer releases expect a ``Service`` object instead --
    # confirm against the installed Selenium version.
    driver = webdriver.Chrome(
        options=chrome_options, executable_path=os.path.abspath("chromedriver")
    )
    last_texts = None
    try:
        driver.get(target)
        while True:
            soup = BeautifulSoup(driver.page_source, "html.parser")
            card = soup.find("div", class_="card current done-typing")
            if card is not None:
                texts = [child.text for child in card]
                # Skip repeats so a card that stays on screen across several
                # polls is printed only once.
                if texts and texts != last_texts:
                    for text in texts:
                        print(text)
                    last_texts = texts
            # Pause between polls instead of spinning at full CPU speed.
            time.sleep(poll_interval)
    finally:
        # Always release the browser process, even on Ctrl-C or an error.
        driver.quit()


HeadlessBrowserHttpRequest(url)