How to speed up the python requests in Scrapy?-CodePudding

I am scraping this website: https://www.saksfifthavenue.com/c/women-s-apparel. Each product has several variants (combinations of sizes and colors), and each variant has its own availability, but that information is not all in one place. I have to send an additional request to a custom URL that includes the prod_id, color and size. This is the part where I am losing the speed of Scrapy, because those additional requests make the scraping pretty slow.

I'd like a workaround, because I have a requirement to finish the scrape in under 6 hours. Right now it has been running for over 5 hours and only 3k products have been scraped in total, because those variant requests are processed one by one. I'd like to speed things up, for example by processing those additional requests faster. Here is my code:

import scrapy
from urllib.parse import urlencode
import requests
from scrapy.selector import Selector
import re
import json
import html


class SaksFifthAvenueSpider(scrapy.Spider):
    """Scrape Saks Fifth Avenue category grids into one dict per product.

    Flow: ``start_requests`` requests each category grid,
    ``parse_page_items`` follows the grid's "show more" URL and queues every
    product link, and ``parse_product_details`` builds the item.

    Variant pages (one per color/size combination) are fetched with the
    synchronous ``requests`` library from inside ``parse_product_details``,
    so each variant is downloaded one after another within the callback.
    """

    name = "safa-feeds"

    # custom settings
    custom_settings = {
        "LOG_FILE": "saks_fifth_avenue.log",
        "ITEM_PIPELINES": {
            "sdt.pipelines.SdtWasabiS3Pipeline": 300,
        },
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "cross-site",
        "Sec-GPC": "1",
    }

    # Query-string template for the grid endpoint; "cgid" is filled in per
    # category in start_requests.
    params = {
        "cgid": "",
        "start": "0",
        "sz": "24",
        "hideLess": "true",
    }

    base_url = "https://www.saksfifthavenue.com/on/demandware.store/Sites-SaksFifthAvenue-Site/en_US/Search-UpdateGrid?"

    def start_requests(self):
        """Yield one grid request per category id."""
        cgid_list = [
            "2534374306418048",
            "2534374306624247",
            "2534374306622828",
            "1608313652004",
            "2534374306418050",
            "2534374306418162",
            "2534374306418054",
            "1654108780232",
            "2534374306418205",
            "2534374306418206",
            "2534374306418217",
            "2534374306418192",
            "1608313652004",
            "2534374306418053",
        ]
        # One id is listed twice above; dict.fromkeys keeps the first
        # occurrence of each id so every category is requested once.
        for cgid in dict.fromkeys(cgid_list):
            # Build a per-request copy instead of mutating the class-level
            # ``params`` dict, which is shared by every instance.
            query = {**self.params, "cgid": cgid}
            yield scrapy.Request(
                url=self.base_url + urlencode(query),
                headers=self.headers,
                callback=self.parse_page_items,
            )

    def parse_page_items(self, response):
        """Queue the next grid page (if any) and every product on this one."""
        hrefs = response.css("a.thumb-link.mw-100::attr(href)").extract()
        # Drop the query string so the same product is only queued once.
        item_links = {
            "https://www.saksfifthavenue.com" + href.split("?")[0] for href in hrefs
        }

        inner_load = response.css("div.show-more ::attr(data-url)").get()
        if inner_load:
            yield scrapy.Request(
                url=inner_load, headers=self.headers, callback=self.parse_page_items
            )

        for link in item_links:
            yield scrapy.Request(
                url=link, headers=self.headers, callback=self.parse_product_details
            )

    @staticmethod
    def _load_ld_json(script_html):
        """Strip the surrounding ``<script>`` tag and parse the JSON-LD payload."""
        payload = script_html.replace(
            '<script type="application/ld+json">', ""
        ).replace("</script>", "")
        return json.loads(payload)

    @staticmethod
    def _clean_texts(texts):
        """Remove newlines from each string and drop the ones left empty."""
        stripped = (text.replace("\n", "") for text in texts)
        return [text for text in stripped if text]

    @staticmethod
    def _variant_url(product_url, prod_id, color, size):
        """Build the URL that selects one color/size combination of a product."""
        return (
            f"{product_url}?dwvar_{prod_id}_color={color.upper()}"
            f"&dwvar_{prod_id}_size={size}&pid={prod_id}"
        )

    @staticmethod
    def _infer_gender(bc_li):
        """Map breadcrumb names to "Boy", "Girl", "Kids" or "".

        NOTE(review): the code this replaces first assigned "Female"/"Male"
        from "Women's Clothing" / "Men" / "Men's" breadcrumbs, but a second
        if/elif chain ending in ``else: ""`` always overwrote that value.
        The effective result is reproduced here unchanged; confirm whether
        the adult genders were meant to survive.
        """

        def mentions(word):
            return any(word in crumb for crumb in bc_li)

        if ("Kids" in bc_li and mentions("Boys")) or mentions("Baby Boy"):
            return "Boy"
        if ("Kids" in bc_li and mentions("Girls")) or mentions("Baby Girl"):
            return "Girl"
        if mentions("Kids") and not (mentions("Boys") or mentions("Girls")):
            return "Kids"
        return ""

    def _size_statuses(self, response):
        """Read size availability from the size list on the product page."""
        size_list = response.css(
            "ul.radio-group-list.size-attribute.swatch-display-three.show-size-dropdown"
        )
        statuses = []
        for entry in size_list.css("li"):
            status = "NOT_AVAILABLE" if entry.css("[disabled]") else "AVAILABLE"
            for label in self._clean_texts(entry.css("::text").getall()):
                statuses.append({"size": label, "status": status})
        return statuses

    def _parse_variant(self, page, color=None):
        """Extract one variant dict from a variant page selector.

        ``color`` overrides the color read from the page when given.
        """
        size = "".join(
            self._clean_texts(page.css("li").css("[selected] ::text").extract())
        )
        disabled = (
            page.css("li").css("[disabled]").css("[selected] ::text").getall()
        )

        final_price = page.css(
            "span.formatted_sale_price.formatted_price.js-final-sale-price.bfx-price.bfx-list-price::text"
        ).get()
        if final_price is None:
            final_price = page.css(
                "span.formatted_sale_price.formatted_price.js-final-sale-price.bfx-price.bfx-sale-price::text"
            ).get()
        # ``.get()`` returns None when nothing matches, so no try/except is
        # needed around it.
        old_price = page.css(
            "span.formatted_price.bfx-price.bfx-list-price ::text"
        ).get()

        if color is None:
            color = page.css(
                "span.text2.color-value.attribute-displayValue::text"
            ).get()

        return {
            "color": color,
            "size": size,
            "status": "NOT_AVAILABLE" if disabled else "AVAILABLE",
            "final_price": final_price,
            "old_price": old_price,
        }

    def _fetch_variant(self, variant_url, color=None):
        """Download a variant page with ``requests`` and parse it.

        This is a synchronous HTTP call made from inside a Scrapy callback,
        so variants are fetched strictly one at a time.
        """
        resp = requests.get(variant_url, headers=self.headers)
        return self._parse_variant(Selector(text=resp.text), color)

    def parse_product_details(self, response):
        """Build and yield the item dict for one product page."""
        ld_scripts = response.css('script[type="application/ld+json"]').extract()
        json_blob = self._load_ld_json(ld_scripts[0])
        # The last JSON-LD script on the page is read as the breadcrumb list.
        bc_json_blob = self._load_ld_json(ld_scripts[-1])
        breadcrumbs = bc_json_blob["itemListElement"]

        prod_id = response.css("div.container.product-detail::attr(data-pid)").get()
        colors = response.css("button::attr(data-adobelaunchproductcolor)").extract()
        sizes = response.css("li::attr(data-attr-value)").extract()

        item = {
            "product_id": prod_id,
            "product_brand": response.css("a.product-brand::text").get(),
            "product_name": response.css("h1.product-name::text").get(),
            "categories": [
                {f"category_{idx}": cat["name"]}
                for idx, cat in enumerate(breadcrumbs, 1)
            ],
            "slug": json_blob["offers"]["url"].split(".com")[-1],
            # Replace HTML tags in the unescaped description with spaces.
            "description": re.sub(
                r"<[^<]+?>", " ", html.unescape(json_blob["description"])
            ),
            "product_variants": [],
            "color": response.css(
                "span.text2.color-value.attribute-displayValue::text"
            ).get(),
            "sizes": [],
        }

        # Preferred source: one variant page per color/size combination.
        for color in colors:
            for i_size in sizes:
                variant_url = self._variant_url(response.url, prod_id, color, i_size)
                item["product_variants"].append(self._fetch_variant(variant_url))

        # Fallback 1: no variants, read availability from the size list.
        if not item["product_variants"]:
            item["sizes"].extend(self._size_statuses(response))

        # Fallback 2: still nothing, walk the size dropdown for the page's
        # single color. Skipped when the page shows no color, since the
        # variant URL cannot be built without one.
        if (
            not item["product_variants"]
            and not item["sizes"]
            and item["color"]
            and response.css("div.form-group.show-size-dropdown-holder")
        ):
            size_dropdown = response.css(
                "ul.radio-group-list.size-attribute.swatch-display-three ::text"
            ).extract()
            for dd_si in self._clean_texts(size_dropdown):
                variant_url = self._variant_url(
                    response.url, prod_id, item["color"], dd_si
                )
                item["product_variants"].append(
                    self._fetch_variant(variant_url, color=item["color"])
                )

        item["gender"] = self._infer_gender([crumb["name"] for crumb in breadcrumbs])

        # The third text/javascript script is expected to hold
        # "pageDataObj = {...};"; strip the wrapper to get the JSON.
        price_json_text = (
            response.css('script[type="text/javascript"]')
            .extract()[2]
            .replace('<script type="text/javascript">\npageDataObj = ', "")
            .replace(";\n</script>", "")
        )
        price_json_blob = json.loads(price_json_text)
        item["tag"] = price_json_blob["products"][0]["tags"]["feature_type"]
        item["price"] = [
            {
                "original_price": p["original_price"],
                "price": p["price"],
                "currency": json_blob["offers"]["priceCurrency"],
            }
            for p in price_json_blob["products"]
        ]
        item["images"] = json_blob["image"]

        yield item

Does anyone have any tips or suggestions on how to optimize the Python requests in Scrapy? Thanks in advance!

CodePudding user response：

As mentioned in the comments one area where performance gains can be made would be to eliminate the use of the requests lib and instead feed all of the additional requests for each of the product variants back into the scrapy engine.

You can do this by collecting all of the variant URLs into a list and passing that list as an argument to callback methods that exclusively parse the variant information from the variant page, then calling the callback again with the next variant URL until there are none left.

I ran multiple tests using this strategy on various sized subsets of your requests and it consistently retrieved the same number of items as your code but in under half the time.

The code:

import scrapy
from urllib.parse import urlencode
import re
import json
import html


class SaksFifthAvenueSpider(scrapy.Spider):
    """Scrape Saks Fifth Avenue category grids into one dict per product.

    Flow: ``start_requests`` requests each category grid,
    ``parse_page_items`` follows the grid's "show more" URL and queues every
    product link, and ``parse_product_details`` builds the base item.

    Variant pages are fetched through Scrapy itself: the remaining variant
    URLs travel with the item in ``cb_kwargs`` and each callback requests the
    next URL, yielding the item only once both URL lists are exhausted.

    NOTE(review): the item is only yielded at the end of that chain, so if a
    variant request fails the item is never emitted. Consider adding an
    errback that continues with the remaining URLs.
    """

    name = "safa-feeds"

    # custom settings
    custom_settings = {
        "LOG_FILE": "saks_fifth_avenue1.log",
        "ITEM_PIPELINES": {
            "sdt.pipelines.SdtWasabiS3Pipeline": 300,
        },
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "cross-site",
        "Sec-GPC": "1",
    }

    # Query-string template for the grid endpoint; "cgid" is filled in per
    # category in start_requests.
    params = {
        "cgid": "",
        "start": "0",
        "sz": "24",
        "hideLess": "true",
    }

    base_url = "https://www.saksfifthavenue.com/on/demandware.store/Sites-SaksFifthAvenue-Site/en_US/Search-UpdateGrid?"

    def start_requests(self):
        """Yield one grid request per category id."""
        cgid_list = [
            "2534374306418048",
            "2534374306624247",
            "2534374306622828",
            "1608313652004",
            "2534374306418050",
            "2534374306418162",
            "2534374306418054",
            "1654108780232",
            "2534374306418205",
            "2534374306418206",
            "2534374306418217",
            "2534374306418192",
            "1608313652004",
            "2534374306418053",
        ]
        # One id is listed twice above; dict.fromkeys keeps the first
        # occurrence of each id so every category is requested once.
        for cgid in dict.fromkeys(cgid_list):
            # Build a per-request copy instead of mutating the class-level
            # ``params`` dict, which is shared by every instance.
            query = {**self.params, "cgid": cgid}
            yield scrapy.Request(
                url=self.base_url + urlencode(query),
                headers=self.headers,
                callback=self.parse_page_items,
            )

    def parse_page_items(self, response):
        """Queue the next grid page (if any) and every product on this one."""
        hrefs = response.css("a.thumb-link.mw-100::attr(href)").extract()
        # Drop the query string so the same product is only queued once.
        item_links = {
            "https://www.saksfifthavenue.com" + href.split("?")[0] for href in hrefs
        }

        inner_load = response.css("div.show-more ::attr(data-url)").get()
        if inner_load:
            yield scrapy.Request(
                url=inner_load, headers=self.headers, callback=self.parse_page_items
            )

        for link in item_links:
            yield scrapy.Request(
                url=link, headers=self.headers, callback=self.parse_product_details
            )

    @staticmethod
    def _load_ld_json(script_html):
        """Strip the surrounding ``<script>`` tag and parse the JSON-LD payload."""
        payload = script_html.replace(
            '<script type="application/ld+json">', ""
        ).replace("</script>", "")
        return json.loads(payload)

    @staticmethod
    def _clean_texts(texts):
        """Remove newlines from each string and drop the ones left empty."""
        stripped = (text.replace("\n", "") for text in texts)
        return [text for text in stripped if text]

    @staticmethod
    def _variant_url(product_url, prod_id, color, size):
        """Build the URL that selects one color/size combination of a product."""
        return (
            f"{product_url}?dwvar_{prod_id}_color={color.upper()}"
            f"&dwvar_{prod_id}_size={size}&pid={prod_id}"
        )

    @staticmethod
    def _infer_gender(bc_li):
        """Map breadcrumb names to "Boy", "Girl", "Kids" or "".

        NOTE(review): the code this replaces first assigned "Female"/"Male"
        from "Women's Clothing" / "Men" / "Men's" breadcrumbs, but a second
        if/elif chain ending in ``else: ""`` always overwrote that value.
        The effective result is reproduced here unchanged; confirm whether
        the adult genders were meant to survive.
        """

        def mentions(word):
            return any(word in crumb for crumb in bc_li)

        if ("Kids" in bc_li and mentions("Boys")) or mentions("Baby Boy"):
            return "Boy"
        if ("Kids" in bc_li and mentions("Girls")) or mentions("Baby Girl"):
            return "Girl"
        if mentions("Kids") and not (mentions("Boys") or mentions("Girls")):
            return "Kids"
        return ""

    def _size_statuses(self, response):
        """Read size availability from the size list on the product page."""
        size_list = response.css(
            "ul.radio-group-list.size-attribute.swatch-display-three.show-size-dropdown"
        )
        statuses = []
        for entry in size_list.css("li"):
            status = "NOT_AVAILABLE" if entry.css("[disabled]") else "AVAILABLE"
            for label in self._clean_texts(entry.css("::text").getall()):
                statuses.append({"size": label, "status": status})
        return statuses

    def _parse_variant(self, page, color=None):
        """Extract one variant dict from a variant page response.

        ``color`` overrides the color read from the page when given.
        """
        size = "".join(
            self._clean_texts(page.css("li").css("[selected] ::text").extract())
        )
        disabled = (
            page.css("li").css("[disabled]").css("[selected] ::text").getall()
        )

        final_price = page.css(
            "span.formatted_sale_price.formatted_price.js-final-sale-price.bfx-price.bfx-list-price::text"
        ).get()
        if final_price is None:
            final_price = page.css(
                "span.formatted_sale_price.formatted_price.js-final-sale-price.bfx-price.bfx-sale-price::text"
            ).get()
        # ``.get()`` returns None when nothing matches, so no try/except is
        # needed around it.
        old_price = page.css(
            "span.formatted_price.bfx-price.bfx-list-price ::text"
        ).get()

        if color is None:
            color = page.css(
                "span.text2.color-value.attribute-displayValue::text"
            ).get()

        return {
            "color": color,
            "size": size,
            "status": "NOT_AVAILABLE" if disabled else "AVAILABLE",
            "final_price": final_price,
            "old_price": old_price,
        }

    def _next_in_chain(self, item, variant_urls, clean_sizes_urls):
        """Return the next variant request, or the finished item.

        Color/size variant URLs are consumed first, then the size-dropdown
        URLs; when both lists are empty the item itself is returned.
        """
        if variant_urls:
            return scrapy.Request(
                variant_urls[0],
                headers=self.headers,
                cb_kwargs={
                    "item": item,
                    "variant_urls": variant_urls[1:],
                    "clean_sizes_urls": clean_sizes_urls,
                },
                callback=self.parse_product_variants,
                # A repeated URL dropped by the duplicate filter would end
                # the chain before the item is yielded.
                dont_filter=True,
            )
        if clean_sizes_urls:
            return scrapy.Request(
                clean_sizes_urls[0],
                headers=self.headers,
                cb_kwargs={"item": item, "clean_sizes_urls": clean_sizes_urls[1:]},
                callback=self.parse_clean_sizes,
                dont_filter=True,
            )
        return item

    def parse_product_details(self, response):
        """Build the base item, then start the variant request chain."""
        ld_scripts = response.css('script[type="application/ld+json"]').extract()
        json_blob = self._load_ld_json(ld_scripts[0])
        # The last JSON-LD script on the page is read as the breadcrumb list.
        bc_json_blob = self._load_ld_json(ld_scripts[-1])
        breadcrumbs = bc_json_blob["itemListElement"]

        prod_id = response.css("div.container.product-detail::attr(data-pid)").get()
        colors = response.css("button::attr(data-adobelaunchproductcolor)").extract()
        sizes = response.css("li::attr(data-attr-value)").extract()

        item = {
            "product_id": prod_id,
            "product_brand": response.css("a.product-brand::text").get(),
            "product_name": response.css("h1.product-name::text").get(),
            "categories": [
                {f"category_{idx}": cat["name"]}
                for idx, cat in enumerate(breadcrumbs, 1)
            ],
            "slug": json_blob["offers"]["url"].split(".com")[-1],
            # Replace HTML tags in the unescaped description with spaces.
            "description": re.sub(
                r"<[^<]+?>", " ", html.unescape(json_blob["description"])
            ),
            "product_variants": [],
            "color": response.css(
                "span.text2.color-value.attribute-displayValue::text"
            ).get(),
            "sizes": [],
        }
        item["gender"] = self._infer_gender([crumb["name"] for crumb in breadcrumbs])

        # The third text/javascript script is expected to hold
        # "pageDataObj = {...};"; strip the wrapper to get the JSON.
        price_json_text = (
            response.css('script[type="text/javascript"]')
            .extract()[2]
            .replace('<script type="text/javascript">\npageDataObj = ', "")
            .replace(";\n</script>", "")
        )
        price_json_blob = json.loads(price_json_text)
        item["tag"] = price_json_blob["products"][0]["tags"]["feature_type"]
        item["price"] = [
            {
                "original_price": p["original_price"],
                "price": p["price"],
                "currency": json_blob["offers"]["priceCurrency"],
            }
            for p in price_json_blob["products"]
        ]

        # One variant page per color/size combination.
        variant_urls = [
            self._variant_url(response.url, prod_id, color, i_size)
            for color in colors
            for i_size in sizes
        ]
        item["images"] = json_blob["image"]
        item["sizes"].extend(self._size_statuses(response))

        # Size-dropdown variants for the page's single color; skipped when
        # the page shows no color, since the URL cannot be built without one.
        clean_sizes_urls = []
        if item["color"] and response.css("div.form-group.show-size-dropdown-holder"):
            size_dropdown = response.css(
                "ul.radio-group-list.size-attribute.swatch-display-three ::text"
            ).extract()
            clean_sizes_urls = [
                self._variant_url(response.url, prod_id, item["color"], dd_si)
                for dd_si in self._clean_texts(size_dropdown)
            ]

        yield self._next_in_chain(item, variant_urls, clean_sizes_urls)

    def parse_product_variants(
        self, response, item, variant_urls, clean_sizes_urls):
        """Record one color/size variant, then continue the chain."""
        item["product_variants"].append(self._parse_variant(response))
        yield self._next_in_chain(item, variant_urls, clean_sizes_urls)

    def parse_clean_sizes(self, response, item, clean_sizes_urls):
        """Record one size-dropdown variant, then continue the chain."""
        item["product_variants"].append(
            self._parse_variant(response, color=item["color"])
        )
        yield self._next_in_chain(item, [], clean_sizes_urls)

Note: It has not been tested with your pipeline