Hello everyone, and thanks in advance for the help.
When running the scraper below locally, everything goes well and I get the expected value:
2023-10-08 00:15:41 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.apc.fr/chemise-clement-kaa-coges-h12512.html>
{'productname': 'Chemise Clément', 'gender': 'men', 'cleancategories': 'shirts', 'price': '180', 'color': 'VERT', 'picture_list': ['https://www.apc.fr/media/catalog/product/attribute/swatches_color/COGES_IAJ.jpg', 'https://www.apc.fr/media/catalog/product/attribute/swatches_color/COGES_KAA.jpg', 'https://www.apc.fr/media/catalog/product/cache/5f20f1917254e6a5a23af6773e8ed099/c/o/coges-h12512kaa_02_1684770825.jpg', 'https://www.apc.fr/media/catalog/product/cache/5f20f1917254e6a5a23af6773e8ed099/c/o/coges-h12512kaa_03_1684770825.jpg', 'https://www.apc.fr/media/catalog/product/cache/5f20f1917254e6a5a23af6773e8ed099/c/o/coges-h12512kaa_04_1684770825.jpg'], 'current_url': 'https://www.apc.fr/chemise-clement-kaa-coges-h12512.html'}
However, when I push this to Apify, the `parse` function runs properly, but the crawl stops there and never moves on to the second function, `aaa_products`:
[apify] INFO TitleSpider is parsaaaaing <200 https://apify.com>...
[apify] INFO TitleSpider is parsaaaaing <200 https://www.apc.fr/men/men-shirts.html>...
[apify] INFO TitleSpider is parsaaaaing <200 https://www.apc.fr/chemise-clement-kaa-coevd-h12512.html>...
[apify] INFO TitleSpider is parsaaaaing <200 https://www.apc.fr/surchemise-basile-pik-woapq-h02709.html>...
[apify] INFO TitleSpider is parsaaaaing <200 https://www.apc.fr/chemise-greg-iaa-coguh-h12499.html>...
Any idea what could be blocking it? Thanks.
The code is here:
from typing import Generator
from scrapy.responsetypes import Response
from apify import Actor
from urllib.parse import urljoin
import nest_asyncio
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.utils.reactor import install_reactor
class TitleSpider(scrapy.Spider):
    """Spider that walks a product listing page and scrapes each product page.

    ``parse`` handles the listing page and schedules one request per product
    link; ``aaa_products`` handles a product page and yields a dict with the
    product name, gender, category, price, color, image URLs and page URL.
    """

    name = 'title_spider'
    # Must be a class attribute: Scrapy reads ``allowed_domains`` from the
    # spider, so a local variable inside ``start_requests`` has no effect.
    allowed_domains = ['apc.fr']

    def start_requests(self):
        """Yield the initial request(s) for the listing page(s) to crawl."""
        urls = [
            "https://www.apc.fr/men/men-shirts.html"
        ]
        for url in urls:
            yield scrapy.Request(dont_filter=True, url=url, callback=self.parse)

    def parse(self, response: Response):
        """Extract product links from a listing page and schedule them.

        Every ``a`` href found under ``.product-link`` inside each
        ``li.product-item`` is requested with ``aaa_products`` as callback.
        """
        Actor.log.info(f'TitleSpider is parsaaaaing {response}...')
        for li in response.css('li.product-item'):
            product_links = li.css('.product-link').css('a::attr(href)').getall()
            for link in product_links:
                # urljoin leaves absolute URLs untouched and resolves relative
                # hrefs, which scrapy.Request would otherwise reject.
                yield scrapy.Request(
                    dont_filter=True,
                    url=response.urljoin(link),
                    callback=self.aaa_products,
                )

    @staticmethod
    def _listing_slug_parts(referer_url):
        """Split a listing URL's last path segment into (gender, category).

        For ``.../men/men-shirts.html`` this returns ``('men', 'shirts')``.
        Returns ``(None, None)`` when ``referer_url`` is empty or ``None``.
        """
        if not referer_url:
            return None, None
        slug = referer_url.split("/")[-1]
        gender, *category_parts = slug.split("-")
        category = "-".join(category_parts).replace(".html", "")
        return gender, category

    def aaa_products(self, response: Response):
        """Scrape a single product page and yield its data as a dict.

        Gender and category are derived from the ``Referer`` request header
        (the listing page the product was reached from). When that header is
        absent, both are reported as ``None`` instead of raising.
        """
        Actor.log.info(f'machin fait nimp {response}...')

        productname = response.css('h1.product-name::text').get()
        current_url = response.url

        price = (
            response.css('div.product-price-wrapper')
            .css('div.price-final_price')
            .css('span.normal-price')
            .css('span[id^="product-price-"]')
            .css('::attr(data-price-amount)')
            .get()
        )

        # The Referer header is not guaranteed to be present (it depends on
        # the referer middleware and on how the request was scheduled), so do
        # not call .decode() on a possibly missing value.
        raw_referer = response.request.headers.get('Referer', None)
        referer_url = raw_referer.decode('utf-8') if raw_referer else None
        gender, cleancategories = self._listing_slug_parts(referer_url)

        color = response.css('li.current-color-label').css('::text').get()

        picture_list = [
            src.strip()
            for src in response.css('img[data-src]').css('img::attr(data-src)').getall()
        ]

        yield {
            'productname': productname,
            'gender': gender,
            'cleancategories': cleancategories,
            'price': price,
            'color': color,
            'picture_list': picture_list,
            'current_url': current_url,
        }
So, after discussing this with Apify support: it's a bug on their side, and it will be fixed in the next sprint.