I tried to extract data from the site below, but I don't know how to put the XPath inside the `for` loop, because the loop variable has to be converted to a string to build the XPath. Could you please help me?
from selenium import webdriver
import pandas as pd
import time
# Open the Statistics Canada page in Chrome and drive the report form.
# Every control is located by XPath; the fixed sleeps give the page time to
# react after each navigation step.
# NOTE(review): explicit waits would be more robust than fixed sleeps.

# Raw string: the backslashes in the Windows path must not be parsed as
# escape sequences ("\W" and "\c" are invalid escapes and trigger a warning).
driver = webdriver.Chrome(r'C:\Webdriver\chromedriver.exe')
driver.get('https://www150.statcan.gc.ca/n1/pub/71-607-x/71-607-x2021004-eng.htm')
time.sleep(2)

# Click the link in the first paragraph of the "cimt_import" element.
button = driver.find_element_by_xpath('//*[@id="cimt_import"]/p[1]/a')
button.click()
time.sleep(2)

# Click the element whose id is "topic3s".
button = driver.find_element_by_xpath('//*[@id="topic3s"]')
button.click()
time.sleep(2)

# Start year: first entry of the "fromYear" list (2022 per the original note).
element_drop_down_startYear = driver.find_element_by_xpath('//*[@id="fromYear"]/option[1]')
element_drop_down_startYear.click()

# Start month: first entry of the "fromMonth" list (January per the original note).
element_drop_down_startMonth = driver.find_element_by_xpath('//*[@id="fromMonth"]/option[1]')
element_drop_down_startMonth.click()

# End year: first entry of the "toYear" list.
element_drop_down_endYear = driver.find_element_by_xpath('//*[@id="toYear"]/option[1]')
element_drop_down_endYear.click()

# End month: fifth entry of the "toMonth" list.
element_drop_down_endmonth = driver.find_element_by_xpath('//*[@id="toMonth"]/option[5]')
element_drop_down_endmonth.click()

# Chapter: first entry of the "report_hs" list.
element_drop_down_specificChapter = driver.find_element_by_xpath('//*[@id="report_hs"]/option[1]')
element_drop_down_specificChapter.click()
time.sleep(1)

# Commodity: second entry of the same "report_hs" list.
# NOTE(review): presumably the list is repopulated after the chapter click,
# hence the one-second pause above - confirm against the live page.
element_drop_down_specific_commodity = driver.find_element_by_xpath('//*[@id="report_hs"]/option[2]')
element_drop_down_specific_commodity.click()

# Click the button inside the "report" form (not a drop-down entry, despite
# the variable name) - presumably this requests the report.
element_drop_down_specific_button = driver.find_element_by_xpath('//*[@id="report"]/div[1]/div[3]/div[5]/p[2]/button')
element_drop_down_specific_button.click()
#--------------------------------------------------------------------
# Print selected cells of the first 25 rows of the "report_table" element.
#
# Pause first so the table has time to render after the button click that
# precedes this loop. (The original listing had this sleep after the prints
# with its indentation lost, so the placement here is a best guess.)
time.sleep(3)

# XPath positions are 1-based, so the rows are numbered 1..25, not 0..24.
# NOTE(review): assumes the table has at least 25 rows; find_element raises
# NoSuchElementException for a row that does not exist.
for cel in range(1, 26):
    # An f-string inserts the row number into the XPath; no explicit str()
    # conversion or string concatenation is needed.
    row_xpath = f'//*[@id="report_table"]/tbody/tr[{cel}]'

    # The original looked up row 1 on every pass; the row index is used here
    # so each iteration reports its own row.
    x = driver.find_element_by_xpath(f'{row_xpath}/td[2]/a')
    print(x.text)

    # Locate each cell first, then print its visible text rather than the
    # XPath string itself.
    print(driver.find_element_by_xpath(f'{row_xpath}/td[4]').text)
    print(driver.find_element_by_xpath(f'{row_xpath}/td[7]').text)
    print(driver.find_element_by_xpath(f'{row_xpath}/td[8]/abbr').text)
CodePudding user response:
You need to find the element before printing it; otherwise you're just printing a string. I think what you want is to print those cells on each iteration of the `for` loop? If so, find the elements like this, then print them:
# Locate and print four cells from each of the first 25 rows of the
# "report_table" element.
# XPath positions start at 1, so iterate 1..25; range(25) would begin with
# tr[0], which matches nothing and makes find_element raise.
for i in range(1, 26):
    row = f'//*[@id="report_table"]/tbody/tr[{i}]'

    # The original fetched row 1 on every pass; the row index is used here so
    # each iteration reads its own row.
    x = driver.find_element_by_xpath(f'{row}/td[2]/a')
    element_1 = driver.find_element_by_xpath(f'{row}/td[4]')
    element_2 = driver.find_element_by_xpath(f'{row}/td[7]')
    element_3 = driver.find_element_by_xpath(f'{row}/td[8]/abbr')

    # Print the visible text of each located element.
    print(x.text, element_1.text, element_2.text, element_3.text)
CodePudding user response:
If you inspect the Network tab, you can see that the webpage is pulling the table data from https://www150.statcan.gc.ca//t1/cimt/rest/getReport/(1)/0/0/12/0/150000/1/0/2022-01-01/2022-05-01
Scrape that JSON endpoint instead:
# Fetch the report data straight from the endpoint the page itself calls,
# instead of driving a browser.
import requests

# A timeout keeps the script from hanging indefinitely on a stalled connection.
r = requests.get(
    'https://www150.statcan.gc.ca//t1/cimt/rest/getReport/(1)/0/0/12/0/150000/1/0/2022-01-01/2022-05-01',
    timeout=30,
)
# Fail with a clear HTTP error instead of a confusing JSON decode error.
r.raise_for_status()
print(r.json())