I am trying to run two spiders behind a company proxy. One is a plain Scrapy spider (`quotes`) and the other is a scrapy-playwright spider (`quotesscroll`):
from typing import Iterable
import os
import scrapy
from scrapy.http import Request
from macro_scrapy.items import QuoteItem
from scrapy_playwright.page import PageMethod
class QuotesSpider(scrapy.Spider):
    """Scrape the first page of quotes.toscrape.com with plain Scrapy.

    Yields one dict per ``div.quote`` element with the keys ``text``,
    ``author`` and ``tags``.
    """

    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/'
    ]

    def parse(self, response):
        """Extract every quote on the page.

        Args:
            response: The downloaded page for one of ``start_urls``.

        Yields:
            dict: ``text`` (curly quotation marks removed), ``author`` and
            ``tags`` (list of tag strings) for each quote.
        """
        for quote in response.css("div.quote"):
            # default="" keeps the chained .replace() calls from failing with
            # AttributeError when a quote block has no span.text element.
            text = quote.css("span.text::text").get(default="")
            yield {
                'text': text.replace("\u201c", "").replace("\u201d", ""),
                'author': quote.css("small.author::text").get(),
                'tags': quote.css("div.tags a.tag::text").getall(),
            }
class QuotesScrollSpider(scrapy.Spider):
    """Render the infinite-scroll quotes page with Playwright and screenshot it.

    The request is routed through scrapy-playwright (``playwright=True``) and
    asks for the page object so the callback can take a screenshot.
    """

    # NOTE(review): when HTTP_PROXY/HTTPS_PROXY are set in the environment,
    # Scrapy's HttpProxyMiddleware may also attach a proxy to these requests,
    # in addition to the one in PLAYWRIGHT_LAUNCH_OPTIONS. If that is what
    # breaks this spider, disabling that middleware for this spider only
    # (via ``custom_settings``) is worth trying -- TODO confirm.
    name = 'quotesscroll'

    def start_requests(self):
        """Yield the single Playwright-rendered request for the scroll page."""
        url = "http://quotes.toscrape.com/scroll"
        yield scrapy.Request(
            url,
            meta={
                "playwright": True,
                "playwright_include_page": True,
                "playwright_page_methods": [
                    PageMethod("wait_for_selector", "div.quote"),
                    PageMethod("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
                    PageMethod("wait_for_selector", "div.quote:nth-child(15)"),
                ],
            },
            # errback is a Request argument; as a key inside ``meta`` it is
            # never registered and handle_failure would never run.
            errback=self.handle_failure,
        )

    async def parse(self, response):
        """Save a full-page screenshot to ``screenshot.png`` and close the page.

        Args:
            response: Response whose ``meta["playwright_page"]`` holds the
                Playwright page requested via ``playwright_include_page``.
        """
        page = response.meta["playwright_page"]
        try:
            await page.screenshot(path="screenshot.png", full_page=True)
        finally:
            # Close the page even if the screenshot fails, so it is not leaked.
            await page.close()

    async def handle_failure(self, failure):
        """Close the Playwright page of a failed request, if one was attached.

        Args:
            failure: The failure passed to the errback; its ``request.meta``
                is checked for a ``playwright_page`` entry.
        """
        page = failure.request.meta.get("playwright_page")
        # The key is absent when the failure happened before a page was
        # attached to the request; there is nothing to close in that case.
        if page is not None:
            await page.close()
I also have these settings in settings.py:
# Scrapy project settings relevant to the two spiders.
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# Route both schemes through the scrapy-playwright download handler.
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}

# Single definition: a second assignment to this name would replace the whole
# dict and silently drop the "headless" option.
PLAYWRIGHT_LAUNCH_OPTIONS = {
    "headless": True,
    "proxy": {
        # os.environ.get returns None for any variable that is not set.
        "server": os.environ.get("SERVER"),
        "username": os.environ.get("USERNAME"),
        "password": os.environ.get("PASSWORD"),
    },
}
I have SERVER, USERNAME and PASSWORD set as environment variables; scrapy-playwright uses them, and `scrapy crawl quotesscroll` works. However, I need to add HTTP_PROXY and HTTPS_PROXY as environment variables in order to run `scrapy crawl quotes` (I do not need to load them explicitly through os.environ.get()). But if I add them, the scrapy-playwright spider (quotesscroll) stops working. I tried running `scrapy crawl quotesscroll` in a different VS Code terminal where HTTP_PROXY was not set, and it worked again.
Is there a way to run both in one terminal?