I'm relatively new to Scrapy and I'm trying to scrape a website with lots of data. The initial URL is a search page with no filters that displays the total number of search results, the first page of results (24 items), and links to the next 49 pages. The problem is that even when the total number of results does not fit in 50 pages, there is no way to access a 51st page. The only way to reach all items is therefore to filter the results so that every search returns no more than 1200 items (50 * 24). To do this, we implemented a recursive price-filtering logic: if the unfiltered search has too many results, we run two new filtered searches, one with price range p=min:20000 (where min is the price of the cheapest car) and another with the open-ended range p=20001:. From then on, any range that is still too large is split at its midpoint, (min + max) // 2.
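Just to make the splitting idea concrete outside of Scrapy, this is roughly the recursion we follow (split_into_ranges and count_cars are hypothetical names used only for this illustration, not functions from my spider):

def split_into_ranges(count_cars, lo, hi=None, increment=20000, max_cars=1200):
    """Recursively split lo:hi until every range holds at most max_cars results.

    count_cars(lo, hi) stands in for "how many results does the site report
    for the price range lo:hi"; hi=None means the range is open-ended.
    """
    total = count_cars(lo, hi)
    if total <= max_cars:
        return [(lo, hi)] if total else []
    mid = (lo + hi) // 2 if hi is not None else lo + increment
    return (split_into_ranges(count_cars, lo, mid, increment, max_cars)
            + split_into_ranges(count_cars, mid + 1, hi, increment, max_cars))

In the actual spider this happens asynchronously, of course: each "count" is a search request, and each split yields two more search requests.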
The logic works fine, but the website is very large and we need complex proxies to access it. If we decrease concurrency via the Scrapy settings, the scraper becomes very slow; if we increase it too much, we start getting 502 responses. Basically, we have to find a trade-off between speed and response success, and we can never guarantee that there will be no 502 responses at all. The problem is that if we start processing individual car ads while the price-filtering logic is still running, and we get 502s at that stage, we risk missing entire price ranges.
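For reference, the knobs we are balancing are the usual Scrapy concurrency and retry settings, something along these lines (the numbers below are placeholders, not the values we actually run with):

custom_settings = {
    # Lower concurrency means fewer 502s but a very slow crawl;
    # higher concurrency is faster but the proxies start returning 502.
    "CONCURRENT_REQUESTS": 16,
    "CONCURRENT_REQUESTS_PER_DOMAIN": 8,
    "DOWNLOAD_DELAY": 0.25,
    # Retry the 502s we do get, though retries alone cannot guarantee success.
    "RETRY_ENABLED": True,
    "RETRY_HTTP_CODES": [502],
    "RETRY_TIMES": 5,
}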
So my question is: how can I make sure the price-filtering requests are completely finished before I start processing the result pages and the individual car ads? I've tried giving the requests different priorities, but that alone doesn't work: the scraper starts processing items long before the price-filtering requests are done. Ideally I would like ALL price-range requests to be finished first, then all page requests, then all individual car ad requests. Here's a simplified version of my scraper:
import math

import scrapy
from furl import furl
from scrapy.loader import ItemLoader

from ..items import AdItem


class CarSpider(scrapy.Spider):
    name = "car_spider"
    allowed_domains = ["cars.com"]
    website = "cars"
    url = "https://cars.com"
    item_class = AdItem
    cars_per_page = 24
    max_pages = 50
    max_cars = cars_per_page * max_pages  # 1200
    price_increment = 20000

    def _get_url(self, params):
        url_obj = furl(self.url).add(params)
        return url_obj.url

    def start_requests(self):
        """Constructs the initial search request (no filters, sorted by price)."""
        params = {
            "sb": "p",  # Order by price
            "od": "up",  # Lowest first
        }
        yield scrapy.Request(
            url=self._get_url(params),
            dont_filter=True,
            callback=self.parse,
        )

    def parse(self, response):
        """
        Parse search results and narrow them down recursively if necessary.
        We assume there will never be more than 1200 cars with the same price.
        """
        if not hasattr(self, "lowest_price"):
            # This is a recursive callback, so we have to make sure the lowest
            # price is calculated in the first iteration only.
            is_first_iteration = True
            self.page_requests = []
            self.total_cars = int(
                response.xpath("//h1[@data-testid='srp-title']/text()").get()
            )  # The total number of cars in the full, unfiltered search
            self.total_pages = math.ceil(self.total_cars / self.cars_per_page)
            self.logger.info(f"Total cars: {self.total_cars}")
            self.crawler.stats.set_value("ads_count", self.total_cars)
            prices = [
                int(price)
                for price in response.xpath(
                    "//span[@data-testid='price-label']/text()"
                ).getall()
            ]
            self.lowest_price = min(prices)
            min_price = self.lowest_price
            mid_price = min_price + self.price_increment
            # The first split is min:(min + 20000) plus an open-ended upper
            # range, so cars above the increment are not lost.
            max_price = ""
        else:
            is_first_iteration = False
            min_price = response.meta["min_price"]
            max_price = response.meta.get("max_price", "")
            if max_price:
                mid_price = (min_price + max_price) // 2
            else:
                mid_price = min_price + self.price_increment

        total_cars = int(
            response.xpath("//h1[@data-testid='srp-title']/text()").get()
        )  # The total number of cars in the current price range

        if total_cars > self.max_cars:
            if is_first_iteration:
                self.logger.info(
                    f"Too many cars ({total_cars}) without price filtering, "
                    "narrowing down..."
                )
            else:
                self.logger.info(
                    f"Too many cars ({total_cars}) in {min_price}:{max_price} "
                    "price range, narrowing down..."
                )
            for price_range in [
                (min_price, mid_price),
                (mid_price + 1, max_price),
            ]:
                min_price, max_price = price_range
                params = {
                    "p": f"{min_price}:{max_price}",
                }
                yield scrapy.Request(
                    url=self._get_url(params),
                    dont_filter=True,
                    callback=self.parse,
                    meta={
                        "min_price": min_price,
                        "max_price": max_price,
                    },
                )
        elif total_cars:
            self.logger.info(
                f"Good number of cars ({total_cars}) in "
                f"{min_price}:{max_price} price range, saving pages..."
            )
            # Parse the first page (parse_list is a generator, so it has to
            # be yielded from, not just called)
            yield from self.parse_list(response)
            # Request the rest of the pages
            total_pages = math.ceil(total_cars / self.cars_per_page)
            for page in range(2, total_pages + 1):
                params = {
                    "pageNumber": page,
                    "p": f"{min_price}:{max_price}",
                }
                yield scrapy.Request(
                    url=self._get_url(params),
                    dont_filter=True,
                    callback=self.parse_list,
                    priority=-1,
                )

    def parse_list(self, response):
        """Parses a page of ads and generates requests for every ad."""
        links = response.xpath(
            "//a[contains(@data-testid, 'result-listing-')]/@href"
        ).getall()
        for link in links:
            yield response.follow(
                url=link,
                dont_filter=True,
                callback=self.parse_car,
                priority=-2,
            )

    def parse_car(self, response):
        loader = ItemLoader(item=self.item_class(), response=response)
        # Populate the loader with the data we need
        yield loader.load_item()