Convert list of comma-separated strings into individual rows in Excel


I am scraping this website https://www.handbook.fca.org.uk/handbook/PRIN/2A/?date=2030-12-01&timeline=True&view=chapter using Scrapy. I have been able to collect all the relevant data, but I can't figure out how to present it in a readable form. Here is my code so far:

import scrapy
from urllib.parse import urlencode
from scrapy.selector import Selector


class HandBook(scrapy.Spider):
    name = "handbook_spider"

    custom_settings = {
        "LOG_FILE": "handbook_spider.log",
        "ITEM_PIPELINES": {
            "handbook_spider.pipelines.HandbookExcelPipeline": 300,
        },
    }

    headers = {
        "authority": "www.handbook.fca.org.uk",
        "accept": "text/html,application/xhtml xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-language": "en,ru;q=0.9",
        "cache-control": "max-age=0",
        "sec-ch-ua": '"Chromium";v="106", "Yandex";v="22", "Not;A=Brand";v="99"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "cross-site",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 YaBrowser/22.11.3.832 (beta) Yowser/2.5 Safari/537.36",
    }

    params = {
        "date": "2030-12-01",
        "timeline": "True",
        "view": "chapter",
    }

    url = "https://www.handbook.fca.org.uk/handbook/PRIN/2A/?"

    def start_requests(self):
        base_url = self.url + urlencode(self.params)
        yield scrapy.Request(
            url=base_url, headers=self.headers, callback=self.parse_details
        )

    def parse_details(self, response):
        item = {}
        item["unique_ids"] = ""
        item["chapter_ref"] = ""
        item["chapter"] = ""
        item["topics"] = ""
        item["clauses"] = ""
        handbook_content = response.css("div.handbook-content").extract()
        for content in handbook_content:
            str_content = Selector(text=content)
            item["unique_ids"] = str_content.css(
                "div.details > span.extended::text"
            ).extract()
            item["chapter_ref"] = str_content.css("h1 > span.extended::text").get()
            item["chapter"] = str_content.css("h1::text").extract()[-1]
            item["topics"] = str_content.css("h2.crosstitle::text").extract()
            item["clauses"] = [
                c.split(".")[-1]
                for c in str_content.css("div.details > span.extended::text").extract()
            ]

            yield item

pipeline:

from scrapy import signals
import os
import pandas as pd
from scrapy.exporters import CsvItemExporter

import logging


class HandbookExcelPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        export_file = open("%s_items_excel.csv" % spider.name, "wb")
        self.file = export_file
        self.files[spider] = self.file
        self.exporter = CsvItemExporter(self.file)
        self.exporter.encoding = "utf-8"
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        try:
            df = pd.read_csv("%s_items_excel.csv" % spider.name).fillna("")
            # sorted_df = df.sort_values(by=["unique_ids"])
            output_path = "%s-items-final.xlsx" % spider.name
            with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
                df.to_excel(
                    writer, sheet_name="Sheet1", header=True, index=False
                )

            # os.remove("%s_items_excel.csv" % spider.name)
        except pd.errors.EmptyDataError:
            spider.log("Excel pipeline No columns to parse from file", logging.ERROR)
            os.remove("%s_items_excel.csv" % spider.name)

My goal is to gather all the data and write into the excel file. The above pipeline gives this output: https://docs.google.com/spreadsheets/d/1PqrCplqizK7jFoA55wEs8iTNvYYrouu3/edit?usp=sharing&ouid=104460094113947817552&rtpof=true&sd=true

and expected output is: https://docs.google.com/spreadsheets/d/1iaais6DiXt2JJIi78qwwKJRtVDpBxwQf/edit?usp=sharing&ouid=104460094113947817552&rtpof=true&sd=true

Note: the Google Sheets links to the Excel files above are publicly accessible.

CodePudding user response:

There are a couple of things you can do to get closer to the output you desire.

  1. All of the items yielded to the feed exporter or pipeline should be unique. That means on each iteration of your loop you should create a new item (item = {}), yield that unique item (yield item), and then do the same again on the next iteration. Right now you create one item and update it over and over again, and since dictionaries are mutable, if a recently yielded item hasn't been processed by the pipeline yet you will be editing that item's data before it has had a chance to be written to the file (see the sketch after this list).

  2. If you want a separate row for each and every clause of each and every topic on the page, then that is the number of items you should expect to yield. Right now you are only yielding an item for each topic and lumping all of the subsections into one item, which is why you only end up with one row per topic.
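
To illustrate the first point, here is a minimal sketch in plain Python (no Scrapy involved, and the field name is just an example) showing why appending the same mutable dict leaves every stored entry with the last values, while building a fresh dict each iteration keeps them distinct:

# Reusing one dict: both stored entries reference the same object,
# so both end up showing the values from the last iteration.
item = {}
collected = []
for value in ("first", "second"):
    item["clause"] = value
    collected.append(item)
print(collected)  # [{'clause': 'second'}, {'clause': 'second'}]

# Building a fresh dict inside the loop keeps every entry distinct.
collected = []
for value in ("first", "second"):
    collected.append({"clause": value})
print(collected)  # [{'clause': 'first'}, {'clause': 'second'}]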

Implementing the above strategies into your spider could look something like this:

import scrapy
from urllib.parse import urlencode
from scrapy.selector import Selector


class HandBook(scrapy.Spider):
    name = "handbook_spider"

    custom_settings = {
        "LOG_FILE": "handbook_spider.log",
        # "ITEM_PIPELINES": {
        #    "handbook_spider.pipelines.HandbookExcelPipeline": 300,
        # },
    }

    headers = {
        "authority": "www.handbook.fca.org.uk",
        "accept": "text/html,application/xhtml xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-language": "en,ru;q=0.9",
        "cache-control": "max-age=0",
        "sec-ch-ua": '"Chromium";v="106", "Yandex";v="22", "Not;A=Brand";v="99"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "cross-site",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 YaBrowser/22.11.3.832 (beta) Yowser/2.5 Safari/537.36",
    }

    params = {
        "date": "2030-12-01",
        "timeline": "True",
        "view": "chapter",
    }

    url = "https://www.handbook.fca.org.uk/handbook/PRIN/2A/?"

    def start_requests(self):
        base_url = self.url + urlencode(self.params)
        yield scrapy.Request(
            url=base_url, headers=self.headers, callback=self.parse_details
        )

    def parse_details(self, response):
        for content in response.css("div.handbook-content"):
            chapter_ref = content.xpath("./header/h1/span[@class='extended']/text()").get()
            chapter = ''.join(content.xpath("./header/h1/text()").getall()).strip()
            topic = None
            for section in content.css('section'):
                header = section.css('header')
                if header:
                    topic = header.css("h2.crosstitle::text").get()
                else:
                    # use a new name so the outer loop variable "content" is not shadowed
                    section_text = section.xpath(
                        ".//div[@class='section-content']//text()").getall()
                    clause = " ".join(map(str.strip, section_text))
                    uid = section.xpath(
                            ".//span[@class='extended']/text()").get()
                    yield {
                        "chapter_ref": chapter_ref,
                        "chapter": chapter,
                        'topic': topic,
                        "unique_ids": uid,
                        "clause": clause,
                        "date": section.xpath(".//time/span/text()").get(),
                    }

You will notice that I don't even construct the item until the moment I yield it, which is one way of making sure each item is unique.

Note: I didn't actually test this with your pipeline, so it may need some debugging, but if you just use Scrapy's built-in feed exporter the result comes a whole lot closer to the spreadsheet with your desired output.
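
For reference, switching to the feed exporter only needs a FEEDS entry in the spider's custom_settings (the output file name below is just an example); Scrapy then writes one CSV row per yielded item:

    custom_settings = {
        "LOG_FILE": "handbook_spider.log",
        # Let Scrapy's built-in feed exporter write the items instead of a custom pipeline.
        "FEEDS": {
            "handbook_items.csv": {"format": "csv", "encoding": "utf8"},
        },
    }

If you would rather keep your Excel pipeline, pandas.DataFrame.explode can expand the list-valued columns into one row per clause before the dataframe is written to the .xlsx file.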
