I have created a scraper using python selenium. **Technology used:**Python,Selenium When i run the scraper it has to fetch the parent element for all the elements in the webpage eg:Button,Image. I am pasting the code below:
#!/usr/bin/python3
# Description: The Python code below will search selenium in Google.
import time
import csv
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
#EVERYTIME CHANGE THE DRIVER PATH TO THE CHROME DRIVER FOR LATEST CHROME VERSION
driver = webdriver.Chrome(
executable_path="D:\Scraper\chromedriver.exe")
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
contents = []
filePath = 'output1.csv'
# As file at filePath is deleted now, so we should check if file
# exists or not not before deleting them
if os.path.exists(filePath):
os.remove(filePath)
else:
print("Can not delete the file as it doesn't exists")
f = open("output1.csv", "a ")
f.write("website," "title," "htmltag," "type," "id," "classname," "for," "href," "alt," "type," "src,"
"name," "width," "height," "data-src," 'inner-text,' 'action,' 'value,' "\n")
file = open("inputLinks1.csv")
csvreader = csv.reader(file)
rows = []
for row in csvreader:
inputlinks = ''.join(str(e) for e in row) #convert to string to solve the usb issue
driver.get(inputlinks)
get_title = driver.title
with open('xpathtags.csv', 'rt') as cp2_csv:
cp_url2 = csv.reader(cp2_csv)
for row1 in cp_url2:
print(row[0])
(xtype, xpathtext) = row1[0].split(';')
print(xtype, xpathtext)
contents.append(xtype)
contents.append(xpathtext)
elems = driver.find_elements_by_xpath(xpathtext)
for elem in elems:
f = open('output1.csv', 'a ', encoding='utf-8')
f.write( inputlinks
"~ " get_title "~ " # title
" " xtype "~ " #dom type - links, image, etc
str(elem.get_attribute('type')).strip() '~ '
str(elem.get_attribute('id')).strip() '~ '
str(elem.get_attribute('class')).strip() '~ '
str(elem.get_attribute('for')).strip() '~ '
str(elem.get_attribute('href')).strip() '~ '
str(elem.get_attribute('alt')).strip() '~ '
str(elem.get_attribute('type')).strip() '~ '
str(elem.get_attribute('src')).strip() '~ '
str(elem.get_attribute('name')).strip() '~ '
str(elem.get_attribute('width')).strip() '~ '
str(elem.get_attribute('height')).strip() '~ '
str(elem.get_attribute('data-src')).strip() '~ '
str(elem.get_attribute('innerText').strip()) '~ '
str(elem.get_attribute('action')).strip() '~ '
str(elem.get_attribute('value')).strip() '~ '
'|~'
'\n')
parent_elem = elems.find_element_by_xpath('..')
print("Parent class attribute: " parent_elem.get_attribute("class"))
f.close()
file.close()
f.close()
driver.close()
I have added a xpathtags file which will fetch all the elements in the webpage.
Link;//a[@href]
Button;//button
Image;//img
Heading1;//h1
Heading2;//h2
Heading3;//h3
Heading4;//h4
Div;//div
Span;//span
https://www.flipkart.com
https://www.ebay.com
Error:
Link //a[@href]
Traceback (most recent call last):
File "demo.py", line 82, in <module>
write_output(row)
File "demo.py", line 77, in write_output
parent_elem = elems.find_element_by_xpath('..')
AttributeError: 'list' object has no attribute 'find_element_by_xpath'
CodePudding user response:
Below line return the list of the Elements
: find_elements_
elems = driver.find_elements_by_xpath(xpathtext)
And while trying to find parent
tag class
you are passing a list
not an WebElement
.
parent_elem = elems.find_element_by_xpath('..')
Replace this - parent_elem = elems.find_element_by_xpath('..')
to
parent_elem = elem.find_element_by_xpath('..')