Home > Enterprise > How to scrape Google Hot Trends
How to scrape Google Hot Trends

Time:02-22

I am trying to scrape Google Hot Trends. I used the Chrome developer tools to capture all requests, but there appear to be no requests going in or out. So I tried Selenium, but I could not get the data, for several reasons; chiefly, the data is variable and changes constantly. Here is the code I tried:

from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

# Chrome options shared by the scraper function below.
# NOTE: this rebinds the name `options` (imported above as a module) to an
# Options instance; the function below reads this global, so the name is kept.
options = options.Options()
# A single "--headless" argument is enough to run Chrome without a window; the
# separate `options.headless = True` setter duplicated it and is deprecated in
# recent Selenium releases, so it has been dropped.
options.add_argument("--headless")
# Hot Trends visualisation page; the query string requests nrow=5 and ncol=5.
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str, poll_interval: float = 1.0) -> str:
    """Reload *target* in Chrome forever and print the "tile" elements found.

    Each pass loads the page, parses the rendered HTML with BeautifulSoup and
    prints the list of every ``<div>`` whose class is ``tile``.

    Args:
        target: URL of the page to load.
        poll_interval: Seconds to pause between reloads so the page is not
            requested in a tight loop.

    Note:
        The loop has no exit condition, so despite the ``-> str`` annotation
        the function never returns normally; stop it with Ctrl+C. The browser
        is shut down on the way out.
    """
    import time  # local import: only the pause between polls needs it

    # NOTE(review): `executable_path` is kept as written; newer Selenium
    # releases expect a Service object instead - confirm against the
    # installed version.
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    try:
        while True:
            driver.get(target)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Match on the CSS class. `soup.find("tile")` looked for a <tile>
            # *tag*, which never matched, and `find` returns only the first
            # element (iterating it yields its children, not all the tiles).
            # NOTE(review): assumes the trend tiles carry class "tile" -
            # verify against the live DOM.
            tiles = soup.find_all("div", class_="tile")
            if tiles:
                print(tiles)
            time.sleep(poll_interval)
    finally:
        # Always release the browser process, e.g. on Ctrl+C or an error.
        driver.quit()


# Start the scraper on the Hot Trends page. The loop inside the function has no
# exit condition, so stop the script with Ctrl+C.
HeadlessBrowserHttpRequest(url)

CodePudding user response:

Your code looks correct.
The only thing I see missing is that you have to extract the text from the web elements you get here.
I would also prefer printing the texts one by one rather than the whole array at once.
As follows:

from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

# Chrome options shared by the scraper function below.
# NOTE: this rebinds the name `options` (imported above as a module) to an
# Options instance; the function below reads this global, so the name is kept.
options = options.Options()
# A single "--headless" argument is enough to run Chrome without a window; the
# separate `options.headless = True` setter duplicated it and is deprecated in
# recent Selenium releases, so it has been dropped.
options.add_argument("--headless")
# Hot Trends visualisation page; the query string requests nrow=5 and ncol=5.
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str, poll_interval: float = 1.0) -> str:
    """Reload *target* in Chrome forever and print the text of each "tile".

    Each pass loads the page, parses the rendered HTML with BeautifulSoup and
    prints, one per line, the text of every ``<div>`` whose class is ``tile``.

    Args:
        target: URL of the page to load.
        poll_interval: Seconds to pause between reloads so the page is not
            requested in a tight loop.

    Note:
        The loop has no exit condition, so despite the ``-> str`` annotation
        the function never returns normally; stop it with Ctrl+C. The browser
        is shut down on the way out.
    """
    import time  # local import: only the pause between polls needs it

    # NOTE(review): `executable_path` is kept as written; newer Selenium
    # releases expect a Service object instead - confirm against the
    # installed version.
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    try:
        while True:
            driver.get(target)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Match on the CSS class. `soup.find("tile")` looked for a <tile>
            # *tag*, which never matched, and `find` returns only the first
            # element (iterating it yields its children, not all the tiles).
            # NOTE(review): assumes the trend tiles carry class "tile" -
            # verify against the live DOM.
            for tile in soup.find_all("div", class_="tile"):
                print(tile.text)
            time.sleep(poll_interval)
    finally:
        # Always release the browser process, e.g. on Ctrl+C or an error.
        driver.quit()


# Start the scraper on the Hot Trends page. The loop inside the function has no
# exit condition, so stop the script with Ctrl+C.
HeadlessBrowserHttpRequest(url)

CodePudding user response:

I managed to solve the problem with the following code:

from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

# Chrome options shared by the scraper function below. Nothing is added to
# them here, so no headless flag is passed to Chrome.
# NOTE: this rebinds the name `options` (imported above as a module) to an
# Options instance; the function below reads this global.
options = options.Options()
# Hot Trends visualisation page; the query string requests nrow=5 and ncol=5.
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str, poll_interval: float = 0.5) -> str:
    """Open *target* in Chrome once, then poll the live page and print cards.

    The page is loaded a single time. The rendered HTML is then re-parsed on
    every pass; when a ``<div>`` whose class attribute is exactly
    ``card current done-typing`` is present, the text of each of its direct
    children is printed, one per line.

    Args:
        target: URL of the page to load.
        poll_interval: Seconds to pause between polls so the loop does not
            spin at full speed re-parsing an unchanged page.

    Note:
        The loop has no exit condition, so despite the ``-> str`` annotation
        the function never returns normally; stop it with Ctrl+C. The browser
        is shut down on the way out.
    """
    import time  # local import: only the pause between polls needs it

    # NOTE(review): `executable_path` is kept as written; newer Selenium
    # releases expect a Service object instead - confirm against the
    # installed version.
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    try:
        driver.get(target)
        while True:
            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Look the card up once per pass instead of once for the guard and
            # again for the extraction.
            # NOTE(review): presumably the page sets "done-typing" once a
            # card's text animation has finished - inferred from the class
            # name, confirm against the live DOM.
            card = soup.find("div", {"class": "card current done-typing"})
            if card is not None:
                # Iterating a tag yields its direct children.
                for child in card:
                    print(child.text)
            time.sleep(poll_interval)
    finally:
        # Always release the browser process, e.g. on Ctrl+C or an error.
        driver.quit()


# Start the scraper on the Hot Trends page. The loop inside the function has no
# exit condition, so stop the script with Ctrl+C.
HeadlessBrowserHttpRequest(url)
  • Related