scraping and storing scraped information to a csv file


How can I write the scraped information to a CSV file, then close the tab, open a new one, and loop through until every page in the forum has been scraped? I'm still learning web scraping and I'm completely stuck on this. The div class that needs to be scraped is "post-content", but it doesn't show the correct information when I test it.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common import window
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import csv

options = webdriver.ChromeOptions()
options.add_experimental_option("detach", True)
options.add_argument("start-maximized")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 100)
driver.get("https://navalcommand.enjin.com/forum/viewforum/2989694/m/11178354/page/1")
elems = driver.find_elements(By.XPATH, "//table[@class='structure small-cells']//a[@href]")
links = []

# create csv file
f = open(r"C:\Users\jammi\OneDrive\Desktop\Navcom\test.csv", 'w', encoding='UTF8')
csvWriter = csv.writer(f)

# to open every thread link
for ele in elems:
    if "viewthread" in ele.get_attribute("href"):
        links.append(ele.get_attribute("href"))
        links = list(dict.fromkeys(links))
        print(elems)
# to open every link into a new tab
for link in links:
    driver.switch_to.new_window(window.WindowTypes.TAB)
    driver.get(link)

# write the scraped information to a csv file
content = driver.find_elements(By.CLASS_NAME, "post-content")
print(content)
csvWriter.writerow([content])

CodePudding user response:

You could use an XPath again for the post-content search:

content = driver.find_elements(By.XPATH, "//div[@class='post-content']")
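
A minimal sketch of how that could feed the CSV writer, assuming the driver and csvWriter already set up in the question's code:

# Sketch: write one CSV row per post found with the XPath above,
# assuming csvWriter and driver from the question's code.
posts = driver.find_elements(By.XPATH, "//div[@class='post-content']")
for post in posts:
    csvWriter.writerow([post.text])   # .text returns the element's visible text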

CodePudding user response:

Your URL is blocked when accessed through Selenium, so I have used another URL and modified your code. Take the logic from the code below and adjust the locators to suit your site:

This code gets all of the thread URLs from the main page, iterates through them, opens each link in a new tab, prints the content, saves the content to a .csv file, closes the tab, and then moves on to the next link.

driver.get("https://ubuntuforums.org/forumdisplay.php?f=326")           # Change this URL to your URL

elems = driver.find_elements(By.XPATH, "//*[@class='threadtitle']//a[@href]")           # Change this XPath as per your website
print("Length: ", len(elems))
links = []

# create csv file
f = open(r"C:\Users\<user name>\Downloads\test.csv", 'w', encoding='UTF8')                  # modify this path
csvWriter = csv.writer(f)

# to open every thread link
for i in range(len(elems)):
    if "showthread" in elems[i].get_attribute("href"):              # Change 'showthread' to the original - 'viewthread'
        links.append(elems[i].get_attribute("href"))
        links = list(dict.fromkeys(links))                      # I am not sure why you are using this line here, anyway that's your decision
        # print("Elements: ", elems)

print("Links: ", links)
print("")

# to open every link into a new tab
for link in links:
    driver.switch_to.new_window(window.WindowTypes.TAB)
    driver.get(link)
    time.sleep(3)
    print("Contents of '", driver.title, "' page")
    print("----")
    # write the scraped information to a csv file
    no_of_content = driver.find_elements(By.CSS_SELECTOR, ".postcontent.restore")               # Change '.postcontent.restore' to the original - '.post-content', but use CSS_SELECTOR
    for i in range(len(no_of_content)):
        print("Content: ", no_of_content[i].text)
        csvWriter.writerow([no_of_content[i].text])
        time.sleep(1)
    print("=============End of the page=================")
    print("")
    time.sleep(1)
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
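
One more thing worth noting: the file opened with open(...) is never closed in either version, so the last rows may not be flushed to disk. A minimal sketch of the same write step using a with block (reusing the names and selector from the code above):

# Sketch: open the CSV with a context manager so it is closed (and flushed) automatically.
with open(r"C:\Users\<user name>\Downloads\test.csv", 'w', encoding='UTF8', newline='') as f:
    csvWriter = csv.writer(f)
    for elem in driver.find_elements(By.CSS_SELECTOR, ".postcontent.restore"):
        csvWriter.writerow([elem.text])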
        
    