I am trying to scrape company information from a company database. I have a list of companies in a text file, which I would want selenium to enter into the website's search and scrape the needed info one by one.
My problem is that for some reason it only enters the last name on the list for some reason. How would I be able to tell python to scrape the first company name on the list, then the next one and so on?
My code is the following:
# -*- coding: utf-8 -*-
# from typing_extensions import Self
from lib2to3.pgen2 import driver
import scrapy
from scrapy.selector import Selector
# from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from shutil import which
count = 0
file = open ("cegek.txt", "r")
lines = file.readlines()
for line in lines:
count = 1
# # cegek = "1000 Út Kft."
class HtSpiderSeleniumceg(scrapy.Spider):
name = 'kamara'
allowed_domains = ["wwww.ceginfo.hu/"]
start_urls = [
'https://www.ceginfo.hu'
]
def __init__(self):
chrome_options = Options()
# chrome_options.add_argument("--headless")
#get login page
driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
driver.get("https://www.ceginfo.hu/")
driver.find_element_by_xpath("//input[@type='search']").send_keys(line)
sleep(2)
driver.find_element_by_xpath("//input[@type='search']").send_keys(u'\ue007')
self.html = driver.page_source
driver.close()
#scrape needed info
def parse(self, response):
resp = Selector(text=self.html)
for ceg in resp.xpath("(//div[contains(@class, 'd-flex flex-column flex-sm-row justify-content-between align-items-center')])[1]"):
yield {
'cegnev': ceg.xpath("(//h2[contains(@class,'s-title heading')])[1]/text()").get(),
'adoszam': ceg.xpath("(.//span[@class='text-uppercase c-border me-lg-3'])[1]/text()").get(),
'cegjegy': ceg.xpath("(.//span[@class='c-border'])[1]/text()").get()
}
This is the exact format the company names list is in:
SZIMIKRON Ipari Kft.
Tigra Computer- és Irodatechnikai Kft.
Tradeland Kft.
Török László EV Török Kulcsszervíz
Tungsram Operations Kft.
Tutti Élelmiszeripari Kft.
Water and Soil Kft.
Webkey Development Kft.
ZDMnet
CodePudding user response:
I can't completely replicate your code without installing Selenium, web driver, etc, but this is how you would implement the solution.
Write a function to read names from cegek.txt and append to a list:
names_to_search = []
def get_names_to_search():
# open file to read
file = open ("cegek.txt", "r")
# read lines in file
lines = file.readlines()
# loop through file and append names to list
for line in lines:
names_to_search.append(line.strip())
# The names_to_search list will contain:
['SZIMIKRON Ipari Kft.', 'Tigra Computer- és Irodatechnikai Kft.', 'Tradeland Kft.', 'Török László EV Török Kulcsszervíz', 'Tungsram Operations Kft.', 'Tutti Élelmiszeripari Kft.', 'Water and Soil Kft.', 'Webkey Development Kft.', 'ZDMnet']
Loop through names_to_search
and pass each name to driver.find_element_by_xpath("//input[@type='search']").send_keys(name)
for name in names_to_search:
driver.find_element_by_xpath("//input[@type='search']").send_keys(name)