I need your help with this. I'm working on a scraping project.
Here's the address whose Outputs tab I'm trying to scrape: https://www.napier.ac.uk/research-and-innovation/research-search?fv=BE7C946393454607AD2CC3D91E65303F~Business School&dtFrom=2021-01&dtTo=2022-12&t1sz=100&tab=1&tabpg1=3#rms
I am able to scrape the first page and the next page (which could be page 2 or 3), but I'm unable to iterate through all the pages, possibly because there's no next-page button in the HTML.
This is what I have done:
output_tab = wait.until(ec.element_to_be_clickable((By.XPATH, "(//a[@class='r-tabs-anchor'][normalize-space()='Outputs'])[1]")))
output_tab.click()
time.sleep(2)

df = pd.DataFrame({'Titles': [''], 'SubTitle': [''], 'Abstract': [''], 'Intro': [''], 'Links': ['']})
counter = 0
while counter < 4:
    driver.refresh()
    post_blocks = driver.find_elements(By.XPATH, "(//div[@class='output bgGrey'])")
    for post_block in post_blocks:
        title = post_block.find_element(By.XPATH, "./div/h3").text  # (//div[@class='output bgGrey'])/div/h3
        sub_title = post_block.find_element(By.XPATH, "./div[3]").text  # (//div[@class='output bgGrey'])/div[3]
        try:
            post_abstract = post_block.find_element(By.XPATH, "./div[4]").text  # (//div[@class='output bgGrey'])/div[4]
        except Exception:
            continue
        try:
            post_intro = post_block.find_element(By.XPATH, "./div[5]").text  # (//div[@class='output bgGrey'])/div[5]
        except Exception:
            continue
        post_link = post_block.find_element(By.XPATH, "./parent::a").get_attribute('href')  # (//div[@class='output bgGrey'])/parent::a
        df = df.append({'Titles': title, 'SubTitle': sub_title, 'Abstract': post_abstract, 'Intro': post_intro, 'Links': post_link}, ignore_index=True)
    next_page = wait.until(ec.element_to_be_clickable((By.XPATH, "(//a[contains(@class,'')][normalize-space()='3'])[2]"))).click()
    counter = 1

df.to_csv('C:/Users/testuser/napier_outputs.csv')
Error noticed:
TimeoutException: TimedPromise timed out after 300000 ms
I can scrape the pages manually by changing the value in this XPath, (//a[contains(@class,'')][normalize-space()='3'])[2], from 3 to 4, and so on.
In summary, how can I iterate through the pages and collect the data as I have done for the first page?
CodePudding user response:
You could determine the number of pages at the beginning by analyzing the link to the "Last" page. This <a> element's href attribute contains the query parameter tabpg1, whose value is equal to the number of pages.
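For example, the page count can be read out of that href with urllib.parse. A minimal sketch (the href below is a shortened, hypothetical example of the real link):

import urllib.parse

# hypothetical "Last" href, trimmed down to the relevant query parameter
href = "https://www.napier.ac.uk/research-and-innovation/research-search?tab=1&tabpg1=7"
query = urllib.parse.urlparse(href).query
last_page = int(urllib.parse.parse_qs(query)["tabpg1"][0])
print(last_page)  # 7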
With this knowledge, you can now retrieve all pages by defining a base URL to which you append the counter. Check this code:
import time
import urllib.parse

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()  # or whichever browser driver you use
wait = WebDriverWait(driver, 3)

counter = 1
base_url = "https://www.napier.ac.uk/research-and-innovation/research-search?fv=BE7C946393454607AD2CC3D91E65303F~Business School&dtFrom=2021-01&dtTo=2022-12&t1sz=100&tab=1&tabpg1="
driver.get(f"{base_url}{counter}")

output_tab = wait.until(ec.element_to_be_clickable((By.XPATH, "(//a[@class='r-tabs-anchor'][normalize-space()='Outputs'])[1]")))
output_tab.click()
time.sleep(2)

# get the number of pages from the "Last" link's tabpg1 query parameter
last_link = driver.find_element(By.ID, "bodycontent_1_ctl07_lnkLast")
url_parts = urllib.parse.urlparse(last_link.get_attribute("href"))
last_page = int(urllib.parse.parse_qs(url_parts.query)["tabpg1"][0])

rows = []  # collect records in a list; DataFrame.append is deprecated and was removed in pandas 2.0
while counter <= last_page:
    driver.refresh()
    post_blocks = driver.find_elements(By.XPATH, "//div[@class='output bgGrey']")
    for post_block in post_blocks:
        title = post_block.find_element(By.XPATH, "./div/h3").text
        sub_title = post_block.find_element(By.XPATH, "./div[3]").text
        try:
            post_abstract = post_block.find_element(By.XPATH, "./div[4]").text
        except Exception:
            continue
        try:
            post_intro = post_block.find_element(By.XPATH, "./div[5]").text
        except Exception:
            continue
        post_link = post_block.find_element(By.XPATH, "./parent::a").get_attribute('href')
        rows.append({'Titles': title, 'SubTitle': sub_title, 'Abstract': post_abstract, 'Intro': post_intro, 'Links': post_link})
    counter += 1  # advance to the next page (a plain `counter = 1` here would loop forever)
    driver.get(f"{base_url}{counter}")

df = pd.DataFrame(rows, columns=['Titles', 'SubTitle', 'Abstract', 'Intro', 'Links'])
df.to_csv('napier_outputs.csv', index=False)
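A note on the design choice: driving pagination through the tabpg1 query parameter avoids clicking the numbered page links altogether, which is likely why the TimeoutException disappears; with the click-based approach, the wait times out as soon as the hard-coded page number (e.g. '3') is no longer present as a clickable link. It also lets you restart the scrape from any page simply by changing counter.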