I need to scrape the titles of all blog post articles loaded via a Load More button, over my desired range for i in range(1,3):
At present I'm only able to capture the titles from the first page, even though I'm able to navigate to the next page using Selenium.
Update:
In a previous question of mine (How To Scrape Content With Load More Pages Using Selenium Python), the pagination URL was captured via:
Network Tab > Reload Page > Click Show more button > Select wp-admin/admin-ajax.php?...... > Right Click > Copy > Copy Link Address.
However, I do not know how to capture a similar URL for the site learnwoo.com/blog; I'm not sure if it uses a different technique.
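For reference, the technique from that question boils down to replaying the copied link with requests - a minimal sketch, where the URL is only a placeholder for whatever you copy from the Network tab:
import requests

# Placeholder URL - paste the admin-ajax link copied from the Network tab
url = 'https://example.com/wp-admin/admin-ajax.php?action=...'
html = requests.get(url).text
print(html[:200])  # quick look at the returned fragment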
Any help would be much appreciated.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time
# Selenium Routine
from requests_html import HTMLSession
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Removes SSL Issues With Chrome
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_argument('--ignore-certificate-errors-spki-list')
options.add_argument('log-level=3')
options.add_argument('--disable-notifications')
#options.add_argument('--headless') # Comment to view browser actions
# Get website url
urls = "https://learnwoo.com/blog/"
r = requests.get(urls)
driver = webdriver.Chrome(executable_path=r"C:\webdrivers\chromedriver.exe", options=options)
driver.get(urls)
productlist = []
for i in range(1,3):
    # Get Page Information
    soup = BeautifulSoup(r.content, features='lxml')
    items = soup.find_all('div', class_='td_module_1')
    print(f'LOOP: start [{len(items)}]')
    for single_item in items:
        title = single_item.find('h3').text.strip()
        print('Title:', title)
        product = {
            'Title': title,
        }
        productlist.append(product)
    print()
    time.sleep(5)
    WebDriverWait(driver, 40).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='next-page-tdi_5']"))).send_keys(Keys.ENTER)
driver.close()
# Save Results
df = pd.DataFrame(productlist)
df.to_csv('Results.csv', index=False)
CodePudding user response:
Alternative solution: you can use the API response to extract the desired data. From the API response, I'm getting 74 items in total, with 6 items per page, so 13 page requests (pages 0-12) cover them all. The payload below comes from the admin-ajax.php request visible in the browser's Network tab; the alm_get_posts action suggests the site uses the WordPress Ajax Load More plugin.
import pandas as pd
import requests
from bs4 import BeautifulSoup
params = {
    'id': '',
    'post_id': '0',
    'slug': 'home',
    'canonical_url': 'https://jooble.org/blog/',
    'posts_per_page': '6',
    'page': '0',
    'offset': '20',
    'post_type': 'post',
    'repeater': 'default',
    'seo_start_page': '1',
    'preloaded': 'false',
    'preloaded_amount': '0',
    'lang': 'en',
    'order': 'DESC',
    'orderby': 'date',
    'action': 'alm_get_posts',
    'query_type': 'standard',
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
}
api_url = 'https://jooble.org/blog/wp-admin/admin-ajax.php'
productlist = []
for params['page'] in range(0, 13):
    req = requests.get(api_url, params=params, headers=headers)
    html = req.json()['html']
    soup = BeautifulSoup(html, 'lxml')
    items = soup.find_all('div', class_='front__news-content-wrapper')
    for single_item in items:
        title = single_item.find('div', class_='front__news-title')
        title = title.text.strip() if title else None
        product = {
            'Title': title,
        }
        productlist.append(product)
df = pd.DataFrame(productlist)
print(df)
Output:
Title
0 How to become an anesthesiologist
1 How to Become a Flight Attendant
2 How To Become An Influencer
3 How to Become an Electrician
4 3 Common Job Scams You Should Stay Away From
.. ...
69 Exploring Main Types of Remote Work
70 14 books HR specialist should read. Part 2
71 14 books HR specialist should read. Part 1
72 Don’t do that: 7 mistakes ruining your job int...
73 Virtual job interview. Jooble tips how to nail it
[74 rows x 1 columns]
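If you'd rather not hard-code the page range, you can keep requesting pages until the returned HTML fragment contains no more items - a minimal sketch reusing the params, headers and api_url from above, assuming the endpoint returns an empty fragment once the posts are exhausted:
page = 0
productlist = []
while True:
    params['page'] = page
    req = requests.get(api_url, params=params, headers=headers)
    soup = BeautifulSoup(req.json()['html'], 'lxml')
    items = soup.find_all('div', class_='front__news-content-wrapper')
    if not items:  # empty page -> no more posts
        break
    for single_item in items:
        title = single_item.find('div', class_='front__news-title')
        productlist.append({'Title': title.text.strip() if title else None})
    page += 1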
CodePudding user response:
To answer your question in a Selenium context, you could call .click():
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH,"//a[@id='next-page-tdi_5']"))).click()
Concerning your XHR request comment - note that here it is not a GET but a POST request (https://learnwoo.com/wp-admin/admin-ajax.php?td_theme_name=Newspaper&v=11), and you have to send some additional payload with requests.
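A minimal sketch of replaying that POST with requests - the action name and form fields below are assumptions and have to be copied from the request payload shown in your browser's Network tab:
import requests

api_url = 'https://learnwoo.com/wp-admin/admin-ajax.php?td_theme_name=Newspaper&v=11'
payload = {
    'action': 'td_ajax_block',  # assumed action name - copy the real one from DevTools
    'td_current_page': '2',     # page to fetch
    # ...remaining form fields copied verbatim from the Network tab payload...
}
headers = {'x-requested-with': 'XMLHttpRequest'}
r = requests.post(api_url, data=payload, headers=headers)
print(r.status_code)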
Example
This example is based on Selenium 4 and uses its imports; see https://www.selenium.dev/documentation/webdriver/getting_started/upgrade_to_selenium_4/#python-1 for details.
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
urls = "https://learnwoo.com/blog/"
driver.get(urls)
productlist = []
for i in range(1,3):
    soup = BeautifulSoup(driver.page_source, 'lxml')
    items = soup.find_all('div', class_='td_module_1')
    print(f'LOOP: start [{len(items)}]')
    for single_item in items:
        product = {
            'Title': single_item.find('h3').text.strip(),
        }
        productlist.append(product)
    WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='next-page-tdi_5']"))).click()
pd.DataFrame(productlist)
Output
Title
0 5 Futuristic eCommerce Trends
1 9 Best Shopify Apps to Print Shipping Labels
2 Cloud Hosting VS VPS Hosting: A Comparison
3 10 Best WooCommerce Facebook Integration Plugins
4 Complete Guide to BigCommerce Security
... ...
91 How To Calculate ROI of Your Moodle LMS?
92 How and Where to Find Help for WordPress Begin...
93 Expert Speaks: In Conversation with Amir Helze...
94 A Complete Analysis: NetSuite WooCommerce Inte...
95 Review of Currency Switcher & Converter for Sh...
96 rows × 1 columns
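To also write the results to a CSV file, as in your original script, assign the DataFrame before saving:
df = pd.DataFrame(productlist)
df.to_csv('Results.csv', index=False)  # same output file as in the question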
CodePudding user response:
Here is an alternative to HedgHog's response: maybe the better way here is to use a while loop, as we don't know how many entries there are. I used a counter to break out of the loop after the fifth loading - if you want to get all the entries, just remove the counter.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
import pandas as pd
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(browser, 20)
big_list = []
counter = 1
url = 'https://learnwoo.com/blog/'
browser.get(url)
while True:
    try:
        load_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[aria-label="load_more"]')))
        load_button.click()
        counter += 1
        print('clicked to load more')
        t.sleep(3)
        # class selector assumed: learnwoo article titles use an "entry-title" class
        entries = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h3[class*="entry-title"]')))
        print('we have', len(entries), 'articles')
        big_list = [(x.text, x.find_element(By.TAG_NAME, 'a').get_attribute('href')) for x in entries]
        if counter > 5:
            break
    except Exception as e:
        print('all done')
        break
df = pd.DataFrame(big_list, columns = ['Article', 'Url'])
print(df)
Result:
Article Url
0 5 Futuristic eCommerce Trends https://learnwoo.com/future-ecommerce-trends/
1 9 Best Shopify Apps to Print Shipping Labels https://learnwoo.com/best-shopify-apps-print-s...
2 Cloud Hosting VS VPS Hosting: A Comparison https://learnwoo.com/cloud-hosting-vps-hosting/
3 10 Best WooCommerce Facebook Integration Plugins https://learnwoo.com/best-woocommerce-facebook...
4 Complete Guide to BigCommerce Security https://learnwoo.com/bigcommerce-security-guide/
... ... ...
286 A Comparison Between Omnichannel and Multichan... https://learnwoo.com/omnichannel-multichannel-...
287 8 Winning Techniques for Off-page SEO https://learnwoo.com/winning-techniques-for-of...
288 WooCommerce – How to Understand User Roles and... https://learnwoo.com/woocommerce-understand-us...
289 7 Best Free WooCommerce Catalog Mode Plugins (... https://learnwoo.com/free-woocommerce-catalog-...
290 Different WooCommerce Product Types Explained ... https://learnwoo.com/woocommerce-different-pro...
291 rows × 2 columns
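As noted above, removing the counter collects every entry - a minimal sketch of that variant, reusing wait and browser from the script above; the loop ends when the wait for the load-more button times out (the h3 selector is the same assumption as above):
while True:
    try:
        load_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[aria-label="load_more"]')))
        load_button.click()
        t.sleep(3)
    except Exception:
        break  # no clickable "load more" button left - everything is loaded
entries = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h3[class*="entry-title"]')))
big_list = [(x.text, x.find_element(By.TAG_NAME, 'a').get_attribute('href')) for x in entries]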