I am trying to download PDFs, but in the case of https://ratsinformation.stadt-koeln.de/si0057.asp?__ksinr=23723 I see no .pdf links that Scrapy could grab. The download links look like https://ratsinformation.stadt-koeln.de/getfile.asp?id=850608&type=do, with no .pdf anywhere in the URL.
Can Scrapy also handle such getfile.asp links and detect the file itself?
This is my approach to get all the PDF links on a specific page:
import scrapy
from scrapy.pipelines.files import FilesPipeline


class PdfPipeline(FilesPipeline):
    # save with the file name from the website instead of the hash
    def file_path(self, request, response=None, info=None):
        file_name = request.url.split('/')[-1]
        return file_name


class StadtKoelnAmtsblattSpider(scrapy.Spider):
    name = 'stadt_koeln_amtsblatt'
    start_urls = ['https://ratsinformation.stadt-koeln.de/si0057.asp?__ksinr=23723']

    custom_settings = {
        "ITEM_PIPELINES": {
            PdfPipeline: 100
        },
        "FILES_STORE": "downloaded_files"
    }

    def parse(self, response):
        links = response.xpath("//a[@class='btn btn-blue']/@href").getall()
        links = [response.urljoin(link) for link in links]  # make them absolute URLs
        yield {
            "file_urls": links
        }
I receive an error for every attempted file download:
OSError: [Errno 22] Invalid argument: 'downloaded_files\\getfile.asp?id=821665&type=do'
CodePudding user response:
The error is caused by the PdfPipeline: the URL does not end in a file name, so request.url.split('/')[-1] yields getfile.asp?id=821665&type=do, and characters such as ? and & are not allowed in Windows file names, hence OSError: [Errno 22] Invalid argument. You therefore have to obtain the file name in the parse method, pass it along in the item, and read it back in the pipeline, like below.
import scrapy
from scrapy.pipelines.files import FilesPipeline


class PdfPipeline(FilesPipeline):
    # save with the file name passed in the item instead of the hash
    # (the item parameter of file_path is available since Scrapy 2.4)
    def file_path(self, request, response=None, info=None, *, item=None):
        return item["filename"]


class StadtKoelnAmtsblattSpider(scrapy.Spider):
    name = 'stadt_koeln_amtsblatt'
    start_urls = ['https://ratsinformation.stadt-koeln.de/si0057.asp?__ksinr=23723']

    custom_settings = {
        "ITEM_PIPELINES": {
            PdfPipeline: 100
        },
        "FILES_STORE": "downloaded_files",
        "USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36"
    }

    def parse(self, response):
        for i, item in enumerate(response.xpath("//a[contains(@title, 'Dokument Download')]")):
            title = item.xpath("./text()").get()
            urls = item.xpath("./@href").getall()
            if title:
                yield {
                    "filename": title + str(i) + ".pdf",  # the index takes care of duplicated file names
                    "file_urls": [response.urljoin(url) for url in urls]
                }
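If some links turn out to have no usable title text, an alternative is to keep deriving the name from the URL but build it only from the id query parameter, which is safe on every file system. This is a minimal sketch of such a pipeline; the class name and naming scheme are my own illustration, not part of the answer above:
from urllib.parse import urlparse, parse_qs

from scrapy.pipelines.files import FilesPipeline


class QueryIdPdfPipeline(FilesPipeline):
    # hypothetical variant: name the file after the "id" query parameter,
    # e.g. getfile.asp?id=821665&type=do -> 821665.pdf
    def file_path(self, request, response=None, info=None, *, item=None):
        query = parse_qs(urlparse(request.url).query)
        file_id = query.get("id", ["unnamed"])[0]
        return f"{file_id}.pdf"
Either way, FilesPipeline does not care whether the URL ends in .pdf; it stores whatever bytes the server returns, so getfile.asp links download fine once the name returned by file_path is valid on the local file system.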