I am trying to scrape data from several listing pages, but the data gets overwritten and the CSV file ends up containing only the data from page 2. I think the data is being overwritten because of the `for` loop. How can I fix this? Any suggestion is welcome. Thank you. This is the page link: https://www.askgamblers.com/online-casinos/countries/uk/
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer
# Scrape payment details for the casinos listed on AskGamblers UK listing
# pages 1 and 2 and dump the collected rows to casino.csv.

# Chrome flags are collected here; note that ``options`` is not handed to the
# driver constructor below.
options = webdriver.ChromeOptions()
for flag in (
    "--no-sandbox",
    "--disable-gpu",
    "--window-size=1920x1080",
    "--disable-extensions",
):
    options.add_argument(flag)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)

for page in range(1, 3):
    URL = f'https://www.askgamblers.com/online-casinos/countries/uk/{page}'
    driver.get(URL)
    time.sleep(2)
    data = []
    page_links = driver.find_elements(
        By.XPATH,
        "//div[@class='card__desc']//a[starts-with(@href, '/online')]",
    )
    # Resolve every href up front, before the driver navigates away from the
    # listing page.
    urls = [anchor.get_attribute("href") for anchor in page_links]

    # NOTE: ``product`` is re-created for every listing page, so rows gathered
    # from earlier pages are discarded -- this is the overwrite described in
    # the question.
    product = []
    for url in urls:
        row = {}
        driver.get(url)
        time.sleep(1)
        # Each lookup below swallows every error; on failure the variable
        # keeps whatever value it had from a previous casino (or is undefined
        # if no lookup has succeeded yet).
        try:
            title = driver.find_element(
                By.CSS_SELECTOR, "h1.review-intro__title"
            ).text
        except:
            pass
        row['Title'] = title

        soup = BeautifulSoup(driver.page_source, "lxml")
        for payments in soup.select("div#tabPayments"):
            try:
                deposit = payments.select_one(
                    ".review-details-wrapper:nth-child(1) .review-details__item:nth-child(1) .review-details__text"
                ).get_text(' ', strip=True)
            except:
                pass
            row['deposit_method'] = deposit

            try:
                currencies = payments.select_one(
                    ".review-details-wrapper:nth-child(1) .review-details__item .review-details__item .review-details__text"
                ).get_text(' ', strip=True)
            except:
                pass
            row['curriences'] = currencies

            try:
                withdrawal = payments.select_one(
                    " .review-details-wrapper .review-details-wrapper .review-details__item:nth-child(1) .review-details__text"
                ).get_text(' ', strip=True)
            except:
                pass
            row['with_drawl method'] = withdrawal

            try:
                withdrawal_time = payments.select_one(
                    " .review-details-wrapper .review-details-wrapper .review-details__item:nth-child(2) .review-details__text"
                )
                # Iterates the matched element's children and strips newlines
                # from each one that has text.
                withdrawal_time = [
                    child.replace("\n", "")
                    for child in withdrawal_time
                    if child.text
                ]
            except:
                pass
            row['with_drawl_time'] = withdrawal_time

        product.append(row)

df = pd.DataFrame(product)
df.to_csv('casino.csv')
CodePudding user response:
To get all results in one file, create the `product` list once, before the page loop, so it is not reset on every page:
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer
# Scrape the payment details of every casino linked from the first three
# AskGamblers UK listing pages and write all rows to a single CSV file.

LISTING_URL = 'https://www.askgamblers.com/online-casinos/countries/uk/{page}'
CASINO_LINK_XPATH = "//div[@class='card__desc']//a[starts-with(@href, '/online')]"

# CSV column name -> CSS selector (relative to the payments tab) whose text
# fills that column.
PAYMENT_TEXT_SELECTORS = {
    'deposit_method': ".review-details-wrapper:nth-child(1) .review-details__item:nth-child(1) .review-details__text",
    'curriences': ".review-details-wrapper:nth-child(1) .review-details__item .review-details__item .review-details__text",
    'with_drawl method': " .review-details-wrapper .review-details-wrapper .review-details__item:nth-child(1) .review-details__text",
}
WITHDRAWAL_TIME_SELECTOR = " .review-details-wrapper .review-details-wrapper .review-details__item:nth-child(2) .review-details__text"


def _text_or_none(node):
    """Return the node's text joined with single spaces, or None if absent."""
    if node is None:
        return None
    return node.get_text(' ', strip=True)


options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
# Chrome's switch takes "width,height"; the "1920x1080" form is not parsed.
options.add_argument("--window-size=1920,1080")
options.add_argument("--disable-extensions")
# Pass the options to the driver; otherwise the flags above have no effect.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
wait = WebDriverWait(driver, 20)

# Created once, outside the page loop, so rows from every page accumulate.
product = []
try:
    for page in range(1, 4):
        driver.get(LISTING_URL.format(page=page))
        time.sleep(2)
        page_links = driver.find_elements(By.XPATH, CASINO_LINK_XPATH)
        # Read every href before navigating: the link elements belong to the
        # listing page and cannot be used once the driver has moved on.
        urls = [
            href
            for href in (link.get_attribute("href") for link in page_links)
            if href
        ]

        for url in urls:
            driver.get(url)
            time.sleep(1)

            # find_elements returns an empty list instead of raising, so a
            # missing heading yields None rather than the previous title.
            headings = driver.find_elements(
                By.CSS_SELECTOR, "h1.review-intro__title"
            )
            # Every column starts as None so a missing element never leaks
            # the value scraped from the previous casino.
            row = {
                'Title': headings[0].text if headings else None,
                'deposit_method': None,
                'curriences': None,
                'with_drawl method': None,
                'with_drawl_time': None,
            }

            soup = BeautifulSoup(driver.page_source, "lxml")
            payments = soup.select_one("div#tabPayments")
            if payments is not None:
                for column, selector in PAYMENT_TEXT_SELECTORS.items():
                    row[column] = _text_or_none(payments.select_one(selector))

                time_node = payments.select_one(WITHDRAWAL_TIME_SELECTOR)
                if time_node is not None:
                    # Collect each non-blank text fragment with newlines
                    # removed; iterating .strings avoids calling str methods
                    # on nested tags.
                    row['with_drawl_time'] = [
                        fragment.replace("\n", "")
                        for fragment in time_node.strings
                        if fragment.strip()
                    ]

            product.append(row)
finally:
    # Always close the browser, even if a page load fails midway.
    driver.quit()

df = pd.DataFrame(product)
df.to_csv('casino.csv')
CodePudding user response:
- The first loop runs only 2 times, because `range(1,3)` yields [1,2]. Change it to `range(1,4)` as below and it will give you [1,2,3]:
for page in range(1,4):
- The data is then overwritten because the output file name is the same for every page. Write the file inside the page loop and include the page number in the file name, as below:
df.to_csv(f'casino_{page}.csv')