Why scrapy "FEEDS" not saving the output file?


The purpose of this Scrapy spider is to check the response status of a bunch of websites. As an experiment, I'm also building a simple desktop GUI app for this spider with Python's tkinter. The app scrapes exactly as I need, but it does not save the output to a file. I asked this question previously, but this time I'm providing more details.

Spider

import scrapy
import pandas as pd
from twisted.internet.error import DNSLookupError


class CheckSpider(scrapy.Spider):
    name = 'check'

    def read_xl(self):
        df = pd.read_excel('url.xlsx')
        return df['url'].tolist()

    def start_requests(self):
        for value in self.read_xl():
            yield scrapy.Request(
                url=value,
                callback=self.parse,
                errback=self.parse_error
            )

    def parse_error(self, failure):
        if failure.check(DNSLookupError):
            request = failure.request
            yield {
                'URL': request.url,
                'Status': failure.value
            }

    def parse(self, response):
        if response.request.meta.get('redirect_urls'):
            yield {
                'URL': response.request.meta.get('redirect_urls')[0],
                'Redirected URL': response.request.url,
                'Status': response.status
            }
        else:
            yield {
                'URL': response.url,
                'Redirected URL': response.request.url,
                'Status': response.status
            }
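
For context, the spider can also be exercised outside the GUI. A minimal standalone runner (just a sketch, assuming a standard Scrapy project layout) looks like this:

# run the 'check' spider on its own and export its items as JSON
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set("FEEDS", {"output.json": {"format": "json", "encoding": "utf8"}})

process = CrawlerProcess(settings)
process.crawl("check")   # the spider name defined by CheckSpider.name
process.start()          # blocks until the crawl finishes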

GUI (apps.py)

from tkinter import *
from tkinter import messagebox
from tkinter import filedialog
from scrapy.utils import project
from scrapy import spiderloader
from scrapy.utils.log import configure_logging
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor
import threading


def get_spiders():
    settings = project.get_project_settings()
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    return spider_loader.list()

def get_chosen_spider(value):
    global chosen_spider
    chosen_spider = value
    return chosen_spider

def get_chosen_feed(value):
    global chosen_feed
    chosen_feed = value
    return chosen_feed

def browse_button():
    global folder_path
    folder_path = filedialog.askdirectory()
    folder_path_entry.delete(0, END)
    folder_path_entry.insert(0, folder_path)
    return folder_path

def execute_spider():
    if dataset_entry.get() == '' or chosen_feed not in ['CSV', 'JSON']:
        messagebox.showerror('Error', 'All entries are required')
        return
    try:
        feed_uri = f'file:///{folder_path}/{dataset_entry.get()}.{chosen_feed}'
    except NameError:   # folder_path is only set once Browse has been clicked
        messagebox.showerror('Error', 'All entries are required')
        return

    settings = project.get_project_settings()
    # settings.set('FEED_URI', feed_uri)
    # settings.set('FEED_TYPE', chosen_feed)
    settings.set("FEEDS", {
        f'output.{chosen_feed}': {
            'format': chosen_feed,
            'encoding': 'utf8'
        }
    })

    configure_logging()
    runner = CrawlerRunner(settings)
    runner.crawl(chosen_spider)

    reactor.run(installSignalHandlers=False)

def start_execute_thread(event):
    global execute_thread
    execute_thread = threading.Thread(target=execute_spider, daemon=True)
    execute_thread.start()
    app.after(10, check_execute_thread)

def check_execute_thread():
    if execute_thread.is_alive():
        app.after(10, check_execute_thread)


app = Tk()
# app title
app.title('Check Website Status')
# app size
app.geometry('300x200')
app.resizable(False, False)

# app label
spider_label = Label(app, text='Choose a spider')
spider_label.grid(row=0, column=0, sticky=W, pady=10, padx=10)

# Choose Spider
spider_text = StringVar(app)
spider_text.set('Choose a spider')
spiders = [spider for spider in get_spiders()]

spiders_dropdown = OptionMenu(app, spider_text, *spiders, command=get_chosen_spider)
spiders_dropdown.grid(row=0, column=1, columnspan=2)

# Feed Type
feed_label = Label(app, text='Choose a feed')
feed_label.grid(row=1, column=0, sticky=W, pady=10, padx=10)

feed_text = StringVar(app)
feed_text.set('Choose a feed')
feeds = ['CSV', 'JSON']

feed_dropdown = OptionMenu(app, feed_text, *feeds, command=get_chosen_feed)
feed_dropdown.grid(row=1, column=1, columnspan=2)

# path entry
folder_path_text = StringVar(app)
folder_path_entry = Entry(app, textvariable=folder_path_text)
folder_path_entry.grid(row=2, column=0, pady=10, padx=10)

# Dataset entry
dataset_text = StringVar(app)
dataset_entry = Entry(app, textvariable=dataset_text, width=10)
dataset_entry.grid(row=2, column=1, pady=10, padx=10)

browse_btn = Button(app, text='Browse', command=browse_button)
browse_btn.grid(row=2, column=2)

execute_btn = Button(app, text='Execute', command=lambda: start_execute_thread(None))
execute_btn.grid(row=3, column=0, columnspan=3)

app.mainloop()

Output

{'URL': 'https://equbot.com/our-customers/', 'Redirected URL': 'https://equbot.com/our-customers/', 'Status': 200}
2022-11-03 00:59:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fincura.com/> (referer: None)
2022-11-03 00:59:43 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.buxtonco.com/customers>
{'URL': 'https://www.buxtonco.com/clients', 'Redirected URL': 'https://www.buxtonco.com/customers', 'Status': 200}
2022-11-03 00:59:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.features-analytics.com/> (referer: None)
2022-11-03 00:59:43 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.fincura.com/>
{'URL': 'https://www.fincura.com/', 'Redirected URL': 'https://www.fincura.com/', 'Status': 200}
2022-11-03 00:59:43 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.eventus.com/> from <GET https://www.eventussystems.com/>
2022-11-03 00:59:43 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.features-analytics.com/>
{'URL': 'https://www.features-analytics.com/', 'Redirected URL': 'https://www.features-analytics.com/', 'Status': 200}
2022-11-03 00:59:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://fincad.com/> (referer: None)
2022-11-03 00:59:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.featurespace.com/customers/> (referer: None)
2022-11-03 00:59:43 [scrapy.core.scraper] DEBUG: Scraped from <200 https://fincad.com/>
{'URL': 'https://fincad.com/', 'Redirected URL': 'https://fincad.com/', 'Status': 200}
2022-11-03 00:59:43 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.featurespace.com/customers/>
{'URL': 'https://www.featurespace.com/customers/', 'Redirected URL': 'https://www.featurespace.com/customers/', 'Status': 200}

CodePudding user response:

There is something wrong with your chosen_feed variable. I couldn't get your app to work; it kept showing a messagebox telling me that all entries were required even though they were filled in.

But I was able to get it to work once I commented out a bunch of stuff and hardcoded the chosen_feed variable to 'json', and it works with 'csv' too.
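
The quick test amounted to hardcoding the format inside execute_spider, roughly like this (just a sketch; the exact lines I commented out aren't shown):

# quick test: ignore the dropdown state and hardcode the feed format
chosen_feed = 'json'   # lowercase works; 'csv' does as well
settings.set("FEEDS", {
    f'output.{chosen_feed}': {'format': chosen_feed, 'encoding': 'utf8'}
})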


Update:

Upon further investigation, I can confirm that the problem is the ALL_CAPS 'CSV' and 'JSON' labels. Scrapy requires lowercase format names ('csv', 'json') in the FEEDS setting; with an unrecognized format name, no output file is written at all.

Simply lowercasing the chosen_feed variable solves the problem.

For example:

def execute_spider():
    global chosen_feed   # needed because chosen_feed is reassigned below
    if dataset_entry.get() == '' or chosen_feed not in ['CSV', 'JSON']:
        messagebox.showerror('Error', 'All entries are required')
        return
    try:
        feed_uri = f'file:///{folder_path}/{dataset_entry.get()}.{chosen_feed}'
    except NameError:   # folder_path is only set once Browse has been clicked
        messagebox.showerror('Error', 'All entries are required')
        return
    chosen_feed = chosen_feed.lower()   # <- added this: Scrapy expects lowercase format names
    settings = project.get_project_settings()
    settings.set("FEEDS", {
        f'output.{chosen_feed}': {   # note: feed_uri is unused, so the file lands in the working directory
            'format': chosen_feed,
            'encoding': 'utf8'
        }
    })

    configure_logging()
    runner = CrawlerRunner(settings)
    runner.crawl(chosen_spider)

    reactor.run(installSignalHandlers=False)

Here is the full working apps.py:

from tkinter import *
from tkinter import messagebox
from tkinter import filedialog
from scrapy.utils import project
from scrapy import spiderloader
from scrapy.utils.log import configure_logging
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor
import threading

def get_spiders():
    settings = project.get_project_settings()
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    return spider_loader.list()

def browse_button():
    global folder_path
    folder_path = filedialog.askdirectory()
    folder_path_entry.delete(0, END)
    folder_path_entry.insert(0, folder_path)
    return folder_path

def execute_spider():
    chosen_spider = spider_text.get()
    chosen_feed = feed_text.get()
    if dataset_entry.get() == '' or chosen_feed not in ['csv', 'json']:
        messagebox.showerror('Error', 'All entries are required')
        return
    try:
        feed_uri = f'file:///{folder_path}/{dataset_entry.get()}.{chosen_feed}'
    except NameError:   # folder_path is only set once Browse has been clicked
        messagebox.showerror('Error', 'All entries are required')
        return

    settings = project.get_project_settings()
    # settings.set('FEED_URI', feed_uri)
    # settings.set('FEED_TYPE', chosen_feed)
    settings.set("FEEDS", {
        f'output.{chosen_feed}': {   # note: feed_uri is unused, so the file lands in the working directory
            'format': chosen_feed,
            'encoding': 'utf8'
        }
    })

    configure_logging()
    runner = CrawlerRunner(settings)
    runner.crawl(chosen_spider)

    reactor.run(installSignalHandlers=False)

def start_execute_thread(event):
    global execute_thread
    execute_thread = threading.Thread(target=execute_spider, daemon=True)
    execute_thread.start()
    app.after(10, check_execute_thread)

def check_execute_thread():
    if execute_thread.is_alive():
        app.after(10, check_execute_thread)


app = Tk()
# app title
app.title('Check Website Status')
# app size
app.geometry('300x200')
app.resizable(False, False)

# app label
spider_label = Label(app, text='Choose a spider')
spider_label.grid(row=0, column=0, sticky=W, pady=10, padx=10)

# Choose Spider
spider_text = StringVar(app)
spider_text.set('Choose a spider')
spiders = [spider for spider in get_spiders()]

spiders_dropdown = OptionMenu(app, spider_text, *spiders)
spiders_dropdown.grid(row=0, column=1, columnspan=2)

# Feed Type
feed_label = Label(app, text='Choose a feed')
feed_label.grid(row=1, column=0, sticky=W, pady=10, padx=10)

feed_text = StringVar(app)
feed_text.set('Choose a feed')
feeds = ['csv', 'json']

feed_dropdown = OptionMenu(app, feed_text, *feeds)
feed_dropdown.grid(row=1, column=1, columnspan=2)

# path entry
folder_path_text = StringVar(app)
folder_path_entry = Entry(app, textvariable=folder_path_text)
folder_path_entry.grid(row=2, column=0, pady=10, padx=10)

# Dataset entry
dataset_text = StringVar(app)
dataset_entry = Entry(app, textvariable=dataset_text, width=10)
dataset_entry.grid(row=2, column=1, pady=10, padx=10)

browse_btn = Button(app, text='Browse', command=browse_button)
browse_btn.grid(row=2, column=2)

execute_btn = Button(app, text='Execute', command=lambda: start_execute_thread(None))
execute_btn.grid(row=3, column=0, columnspan=3)

app.mainloop()
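
One caveat with both versions: feed_uri is built but never handed to Scrapy, so the export always ends up as output.csv / output.json in the working directory rather than in the browsed folder. If the file should land in the chosen folder under the chosen name, a minimal sketch reusing the same GUI variables would be:

# write the feed to the folder and filename picked in the GUI
feed_path = f'{folder_path}/{dataset_entry.get()}.{chosen_feed}'
settings.set("FEEDS", {
    feed_path: {                 # FEEDS keys accept plain file paths as well as URIs
        'format': chosen_feed,   # must be lowercase: 'csv' or 'json'
        'encoding': 'utf8'
    }
})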
