I want to read URLs from a file, but I get an error saying the URL must be a string.
Is there any possible solution for that? Kindly recommend how to make the URL a string, or suggest another method.
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer
with open("data.csv") as file:
    start_urls = [line.strip() for line in file]
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920x1080")
    options.add_argument("--disable-extensions")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(start_urls)
CodePudding user response:
As commented by @JeJe, you need to iterate through your start_urls list. start_urls is a LIST of strings, and driver.get expects just one URL as a string - something like:
# driver.get(start_urls) # NOT like this
for sUrl in start_urls:
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###
or, if you want to keep track of progress, something like:
suLen = len(start_urls)
for si, sUrl in enumerate(start_urls, 1):  # start at 1 for human-friendly progress
    print(f'[{si} of {suLen}] reading {sUrl}')
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###
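If some of the URLs might be malformed or unreachable, it can also help to wrap driver.get in a try/except so one bad row doesn't abort the whole run - a minimal sketch, assuming selenium's WebDriverException covers the failures you care about:
from selenium.common.exceptions import WebDriverException
failed = []
suLen = len(start_urls)
for si, sUrl in enumerate(start_urls, 1):
    print(f'[{si} of {suLen}] reading {sUrl}')
    try:
        driver.get(sUrl)
        ### SCRAPE AS NEEDED ###
    except WebDriverException as ex:
        # remember the failure and move on instead of crashing
        failed.append((sUrl, str(ex)))
print(f'done, {len(failed)} of {suLen} urls failed')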
Btw, you don't need to do everything under with open... - reading start_urls is enough:
with open("data.csv") as file:
    start_urls = [line.strip() for line in file]

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
# pass the configured options to Chrome (the original never used them)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

suLen = len(start_urls)
for si, sUrl in enumerate(start_urls, 1):
    print(f'[{si} of {suLen}] reading {sUrl}')
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###
But you can only use start_urls = [line.strip() for line in file] if your csv has only one column and no header row.
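If your data.csv does have a header row or more than one column, the csv module (you already import writer from it) can pull out just the URL column - a minimal sketch, assuming a column literally named "url", which you would rename to match your file:
import csv
with open("data.csv", newline="") as file:
    reader = csv.DictReader(file)
    # "url" is an assumed column name - replace it with your real header
    start_urls = [row["url"].strip() for row in reader]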
CodePudding user response:
You could rewrite your code using functions to deal with each step:
def read_file(has_header: bool = True):
    with open("data.csv") as file:
        start_urls = [line.strip() for line in file]
    # skip the first line if the file has a header row
    if has_header:
        return start_urls[1:]
    else:
        return start_urls

def scrap_urls():
    urls = read_file(has_header=True)
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920x1080")
    options.add_argument("--disable-extensions")
    # pass the configured options to Chrome (the original never used them)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    for url in urls:
        driver.get(url)

if __name__ == '__main__':
    scrap_urls()
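Note that neither version ever closes the browser. A try/finally around the loop makes sure Chrome shuts down even if a page errors out - a minimal sketch of that cleanup, reusing the same function names as above:
def scrap_urls():
    urls = read_file(has_header=True)
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        for url in urls:
            driver.get(url)
            ### SCRAPE AS NEEDED ###
    finally:
        # always quit the driver, even if a page raised
        driver.quit()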