How can I control request order in Scrapy?


I'm relatively new to Scrapy and I'm trying to scrape a website with lots of data. The initial URL is a search page with no filtering that displays the total number of search results, the first page of results (24 items), and links to the next 49 pages. The problem is that even when the total number of results doesn't fit in 50 pages, the 51st page cannot be accessed. The only way to reach all items, therefore, is to filter the results so that every search always returns fewer than 1200 items (50 * 24). To achieve this, we implemented a recursive price-filtering logic: if there are too many results in the unfiltered search, we run two new filtered searches, one with price range p=min:20000 (where min is the price of the cheapest car) and another with price range p=20001: (from then on, the midpoint of a bounded range is calculated as (min + max) // 2), as sketched below.
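To make the splitting rule concrete, here is a minimal standalone sketch of it (split_range is a hypothetical helper written for illustration; the real spider inlines this logic and uses an empty string rather than None for the open upper bound):

# Illustrative sketch of the recursive splitting rule described above;
# split_range is a hypothetical helper, not part of the actual spider.
PRICE_INCREMENT = 20000

def split_range(min_price, max_price=None):
    """Split an overfull price range into two narrower ones."""
    if max_price is None:
        # Open-ended range: slice off a fixed-width chunk at the bottom.
        mid = min_price + PRICE_INCREMENT
    else:
        # Bounded range: bisect at the midpoint.
        mid = (min_price + max_price) // 2
    return [(min_price, mid), (mid + 1, max_price)]

# E.g. split_range(500) -> [(500, 20500), (20501, None)]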

The logic works fine, but this website is very large and we need complex proxies to access it. If we decrease concurrency via the Scrapy settings (roughly the knobs sketched below), the scraper runs very slowly; if we increase it too much, we start getting 502 responses. Basically, we need to find a trade-off between speed and response success, but we cannot guarantee that there will be no 502 responses at all. The problem is that if we start processing individual car ads while the price-filtering logic is still running and we get 502s at that stage, we risk missing entire price ranges.
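For reference, these are roughly the settings involved in that trade-off, e.g. in settings.py or in the spider's custom_settings (the values below are placeholders, not the ones we actually use):

# Placeholder values for the speed-vs-502 trade-off described above;
# these are not the settings from the real project.
CONCURRENT_REQUESTS = 8  # lower -> fewer 502s, but a slower crawl
CONCURRENT_REQUESTS_PER_DOMAIN = 4
DOWNLOAD_DELAY = 0.5
RETRY_HTTP_CODES = [502]  # make sure 502 responses are retried
RETRY_TIMES = 5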

So my question is: how can I make sure the price-filtering requests are completely done before I start processing the individual pages and car ads? I've tried giving the requests different priorities, but that simply doesn't work: the scraper starts processing items long before the price-filtering requests are done. Ideally I would like ALL price-range requests to finish first, then all page requests, then all individual car ad requests. Here's a simplified version of my scraper:

import math

import scrapy
from furl import furl
from scrapy.loader import ItemLoader

from ..items import AdItem


class CarSpider(scrapy.Spider):

    name = "car_spider"
    allowed_domains = ["cars.com"]
    website = "cars"
    url = "https://cars.com"
    item_class = AdItem

    cars_per_page = 24
    max_pages = 50
    max_cars = cars_per_page * max_pages  # 1200
    price_increment = 20000

    def _get_url(self, params):
        """Builds a search URL with the given query parameters."""
        return furl(self.url).add(params).url

    def start_requests(self):
        """Constructs the list of initial links to scrape."""

        params = {
            "sb": "p",  # Order by price
            "od": "up",  # Lowest first
        }

        yield scrapy.Request(
            url=self._get_url(params),
            dont_filter=True,
            callback=self.parse,
        )

    def parse(self, response):
        """
        Parse search results and narrow them down recursively if necessary.

        We assume there will never be more than 1200 cars with the same price.
        """

        if not hasattr(self, "lowest_price"):
            # This callback is recursive, so the lowest price must be
            # calculated in the first iteration only.
            is_first_iteration = True
            self.total_cars = int(
                response.xpath("//h1[@data-testid='srp-title']/text()").get()
            )  # The total number of cars in the full, unfiltered search
            self.total_pages = math.ceil(self.total_cars / self.cars_per_page)
            self.logger.info(f"Total cars: {self.total_cars}")
            self.crawler.stats.set_value("ads_count", self.total_cars)

            prices = [
                int(price)
                for price in response.xpath(
                    "//span[@data-testid='price-label']/text()"
                ).getall()
            ]
            self.lowest_price = min(prices)
            min_price = self.lowest_price
            mid_price = min_price + self.price_increment
            max_price = ""  # Open-ended upper bound on the first split

        else:
            is_first_iteration = False
            min_price = response.meta["min_price"]
            max_price = response.meta.get("max_price", "")
            if max_price:
                mid_price = (min_price + max_price) // 2
            else:
                mid_price = min_price + self.price_increment

        total_cars = int(
            response.xpath("//h1[@data-testid='srp-title']/text()").get()
        )  # The total number of cars in the current price range

        if total_cars > self.max_cars:
            if is_first_iteration:
                self.logger.info(
                    f"Too many cars ({total_cars}) without price filtering, "
                    "narrowing down..."
                )
            else:
                self.logger.info(
                    f"Too many cars ({total_cars}) in {min_price}:{max_price} "
                    "price range, narrowing down..."
                )

            # Search each half of the range; use fresh names so the loop
            # doesn't clobber min_price/max_price.
            for range_min, range_max in [
                (min_price, mid_price),
                (mid_price + 1, max_price),
            ]:
                params = {
                    "p": f"{range_min}:{range_max}",
                }
                yield scrapy.Request(
                    url=self._get_url(params),
                    dont_filter=True,
                    callback=self.parse,
                    meta={
                        "min_price": range_min,
                        "max_price": range_max,
                    },
                )
        elif total_cars:
            self.logger.info(
                f"Good number of cars ({total_cars}) in "
                f"{min_price}:{max_price} price range, saving pages..."
            )
            # Parse first page
            self.parse_list(response)
            # Parse the rest of the pages
            total_pages = math.ceil(total_cars / self.cars_per_page)
            for page in range(2, total_pages + 1):
                params = {
                    "pageNumber": page,
                    "p": f"{min_price}:{max_price}",
                }
                yield scrapy.Request(
                    url=self._get_url(params),
                    dont_filter=True,
                    callback=self.parse_list,
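                    # Lower priority than the price-filtering requests (0)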
                    priority=-1,
                )

    def parse_list(self, response):
        """Parses a page of ads and generates requests for every ad."""
        links = response.xpath(
            "//a[contains(@data-testid, 'result-listing-')]/@href"
        ).getall()
        for link in links:
            yield response.follow(
                url=link,
                dont_filter=True,
                callback=self.parse_car,
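                # Even lower priority: fetch ads after list pages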
                priority=-2,
            )

    def parse_car(self, response):
        loader = ItemLoader(item=self.item_class(), response=response)
        # Populate the loader with the data we need
        yield loader.load_item()