I was trying to download file via scrapy-playwright, but for some reason download of file fails. On URL there is pdf file that I want to download, and I can see from logs that download is started, but it is interrupted. I am not sure why is that happening since error is not clear enough.
import scrapy
from scrapy_playwright.page import PageMethod
from playwright.async_api import Dialog
import logging
class AwesomeSpider(scrapy.Spider):
name = "awesome"
def start_requests(self):
# GET request
yield scrapy.Request(
url="https://www.fiat.rs/content/dam/fiat/rs/cenovnici-i-katalozi/cenovnik-fiat-500.pdf",
meta=dict(
playwright=True,
playwright_include_page = True,
playwright_page_event_handlers = {
"download": self.handle_download
},
playwright_page_methods={
"expect_download": PageMethod("expect_download", predicate=self.is_download_done, timeout=120000),
},
),
errback=self.errback,
)
def is_download_done(self, download):
logging.info(f"Is download done: {download.is_done()}")
return download.is_done()
async def handle_download(self, download) -> None:
print("File download started")
print(f"File: {download.suggested_filename}")
await download.save_as("~/projects/nenad/temp/" + download.suggested_filename)
print(f"Received file with path {await download.path()}")
async def parse(self, response):
# 'response' contains the page as seen by the browser
logging.info("Parsing response")
page = response.meta["playwright_page"]
file = await page.expect_download(timeout=120000)
# screenshot contains the image's bytes
await page.close()
return {"file": file}
async def errback(self, failure):
logging.info(
"Handling failure in errback, request=%r, exception=%r", failure.request, failure.value
)
logging.info(failure)
page = failure.request.meta["playwright_page"]
await page.close()
await page.context.close()
Error logs, that are showing the error can be found here:
2023-10-16 20:37:01 [scrapy-playwright] INFO: Launching browser chromium
2023-10-16 20:37:01 [scrapy-playwright] INFO: Browser chromium launched
2023-10-16 20:37:01 [scrapy-playwright] DEBUG: Browser context started: 'default' (persistent=False, remote=False)
2023-10-16 20:37:02 [scrapy-playwright] DEBUG: [Context=default] New page created, page count is 1 (1 for all contexts)
2023-10-16 20:37:02 [scrapy-playwright] DEBUG: [Context=default] Request: <GET https://www.fiat.rs/content/dam/fiat/rs/cenovnici-i-katalozi/cenovnik-fiat-500.pdf> (resource type: document)
2023-10-16 20:37:02 [scrapy-playwright] DEBUG: [Context=default] Response: <200 https://www.fiat.rs/content/dam/fiat/rs/cenovnici-i-katalozi/cenovnik-fiat-500.pdf>
File download started
File: cenovnik-fiat-500.pdf
2023-10-16 20:37:02 [root] INFO: Handling failure in errback, request=<GET https://www.fiat.rs/content/dam/fiat/rs/cenovnici-i-katalozi/cenovnik-fiat-500.pdf>, exception=Error('net::ERR_ABORTED at https://www.fiat.rs/content/dam/fiat/rs/cenovnici-i-katalozi/cenovnik-fiat-500.pdf\n=========================== logs ===========================\nnavigating to "https://www.fiat.rs/content/dam/fiat/rs/cenovnici-i-katalozi/cenovnik-fiat-500.pdf", waiting until "load"\n============================================================')
2023-10-16 20:37:02 [root] INFO: [Failure instance: Traceback: <class 'playwright._impl._api_types.Error'>: net::ERR_ABORTED at https://www.fiat.rs/content/dam/fiat/rs/cenovnici-i-katalozi/cenovnik-fiat-500.pdf
=========================== logs ===========================
navigating to "https://www.fiat.rs/content/dam/fiat/rs/cenovnici-i-katalozi/cenovnik-fiat-500.pdf", waiting until "load"
============================================================
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/twisted/internet/defer.py:735:errback
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/twisted/internet/defer.py:798:_startRunCallbacks
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/twisted/internet/defer.py:892:_runCallbacks
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/twisted/internet/defer.py:1792:gotResult
--- <exception caught here> ---
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/twisted/internet/defer.py:1693:_inlineCallbacks
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/twisted/python/failure.py:518:throwExceptionIntoGenerator
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/scrapy/core/downloader/middleware.py:54:process_request
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/twisted/internet/defer.py:1065:adapt
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/scrapy_playwright/handler.py:322:_download_request
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/scrapy_playwright/handler.py:357:_download_request_with_page
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/playwright/async_api/_generated.py:9251:goto
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/playwright/_impl/_page.py:473:goto
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/playwright/_impl/_frame.py:138:goto
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/playwright/_impl/_connection.py:61:send
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/playwright/_impl/_connection.py:482:wrap_api_call
/home/mrav/.local/share/virtualenvs/scrapy_google-izqaQN_l/lib/python3.8/site-packages/playwright/_impl/_connection.py:97:inner_send
]
Received file with path /tmp/playwright-artifacts-Of01oV/40467222-8f9f-4c24-9e67-348d60a34a68
2023-10-16 20:37:02 [scrapy-playwright] DEBUG: Browser context closed: 'default' (persistent=False, remote=False)
2023-10-16 20:37:03 [scrapy.core.engine] INFO: Closing spider (finished)
It seems to me that download of file is started, but for some reason scraper do not wait for download to be finished.
In settings.py I have additional configuration for Playwright:
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
DOWNLOAD_HANDLERS = {
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
PLAYWRIGHT_BROWSER_TYPE = "chromium"
Any suggestion what could be wrong is more then welcomed.