Scraping old price and new price from a website that structures products in two different ways

Who can help?

I cannot manage to match the old price and the new price to the correct variant lines in terms of weight and color.

So far I have code that first collects the product links in the file douglas_data.json, then assembles the variant links from the available article numbers and stores them in varianten_links.json together with some product information. Unfortunately I get stuck at the price assignment, i.e. matching the product's color and quantity to the respective prices. Note that the products on this website are displayed in two different ways:

Single product with only quantity information, for example: https://www.douglas.de/de/p/2001002295?variant=192308

Product with quantity and color variants, for example: https://www.douglas.de/de/p/3000062969?variant=730824
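
What I am trying to get in varianten_links.json is one row per weight/color variant that carries both prices, roughly like this (the variant, old_price and new_price values are only placeholders for what I want, my code below does not fill them yet):

{"link": "https://www.douglas.de/de/p/3000062969?variant=730824", "item_No": "730824", "variant": "<weight or color of this variant>", "old_price": "<crossed-out price, if reduced>", "new_price": "<current price>"}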

Can anyone help me with this? My code so far is:

import scrapy
import json
from urllib.parse import urljoin, urlparse, parse_qs
from scrapy_splash import SplashRequest
from scrapy.crawler import CrawlerProcess


class DouglasspiderPySpider(scrapy.Spider):
    name = 'douglasspider'
    start_urls = [
        'https://www.douglas.de/de/c/parfum/damenduefte/parfum/010106/',
        'https://www.douglas.de/de/c/make-up/lippen/lippenbalsam/030206/'
    ]
    json_file = 'douglas_data.json'
    variants_json_file = 'varianten_links.json'

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse_links, args={'wait': 2}, meta={'filename': self.json_file})

    def parse_links(self, response):
        product_links = response.css(
            '.link.link--no-decoration.product-tile__main-link::attr(href)').getall()
        filename = response.meta['filename']
        absolute_links = [urljoin(response.url, link)
                          for link in product_links]

        with open(filename, 'a', encoding='utf-8') as file:
            for link in absolute_links:
                parsed_url = urlparse(link)
                if parsed_url.query and 'variant' in parse_qs(parsed_url.query):
                    link = link.split('?')[0]

                data = {
                    'link': link,
                }
                json.dump(data, file, ensure_ascii=False)
                file.write('\n')

                yield SplashRequest(url=link, callback=self.parse_product_details, args={'wait': 2})

        self.log(f'Product URLs were appended to {filename}')

        next_page_button = response.css(
            'a.pagination__arrow:nth-child(3)::attr(href)').get()
        if next_page_button:
            next_page_url = urljoin(response.url, next_page_button)
            yield SplashRequest(url=next_page_url, callback=self.parse_links, args={'wait': 2}, meta={'filename': filename})

    def parse_product_details(self, response):
        # article numbers of the quantity/weight variants (hidden radio buttons)
        item_no_weight_product = response.css(
            'label.radio-group__button-icon.radio-group__button-icon--hidden::attr(for)').getall()
        # article numbers of the color variants (scrollable color blobs)
        item_no_color_product = response.css(
            '[data-testid="variant-blobs-scrollable-blob"]::attr(data-code)').getall()
        # article number from the classifications block (single products)
        item_no = response.css(
            '.product-detail-info__classifications > div:nth-child(1) > span:nth-child(2)::text').getall()

        # price of the variant that is currently selected on the page
        new_price = response.css(
            '.product-price__price::text').get()

        data = {
            'link': response.url,
            'item_No_weight_product': item_no_weight_product,
            'item_No_color_product': item_no_color_product,
            'item_No': item_no,
            'new_price': new_price
        }

        with open(self.json_file, 'a', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False)
            file.write('\n')

        variant_links = []

        # build one ?variant=<article number> link per collected article number
        for item_no_weight in item_no_weight_product:
            variant_links.append(f'{response.url}?variant={item_no_weight}')
        for item_no_color in item_no_color_product:
            variant_links.append(f'{response.url}?variant={item_no_color}')
        for item_no_single in item_no:
            variant_links.append(f'{response.url}?variant={item_no_single}')

        with open(self.variants_json_file, 'a', encoding='utf-8') as variant_file:
            for variant_link in variant_links:
                variant_data = {
                    'link': variant_link,
                    'item_No': parse_qs(urlparse(variant_link).query).get('variant', [None])[0],
                    'brand_name': response.css('.brand-logo__text::text').get(),
                    'product_name': ' '.join(filter(None, [response.css('.brand-line::text').get(), response.css('.header-name::text').get()])),
                    'category_path': ' > '.join(response.css('span.breadcrumb__entry > a::text').getall()),
                    'new_price': new_price
                }
                json.dump(variant_data, variant_file, ensure_ascii=False)
                variant_file.write('\n')

        self.remove_duplicate_links(self.variants_json_file)

    def remove_duplicate_links(self, json_file):
        # collect every link that occurs more than once, then let keep_one_duplicate prune them
        with open(json_file, 'r', encoding='utf-8') as file:
            data = [json.loads(line) for line in file]

        seen_links = set()
        new_data = []
        duplicate_links = set()
        for entry in data:
            link = entry.get('link')
            if link in seen_links:
                duplicate_links.add(link)
            seen_links.add(link)
            new_data.append(entry)

        with open(json_file, 'w', encoding='utf-8') as file:
            for entry in new_data:
                json.dump(entry, file, ensure_ascii=False)
                file.write('\n')

        if duplicate_links:
            self.keep_one_duplicate(json_file, duplicate_links)

    def keep_one_duplicate(self, json_file, duplicate_links):
        with open(json_file, 'r', encoding='utf-8') as file:
            data = [json.loads(line) for line in file]

        new_data = []
        kept_duplicates = set()
        for entry in data:
            link = entry.get('link')
            if link in duplicate_links:
                # keep the first occurrence of a duplicated link and drop the rest
                if link in kept_duplicates:
                    continue
                kept_duplicates.add(link)
            new_data.append(entry)

        with open(json_file, 'w', encoding='utf-8') as file:
            for entry in new_data:
                json.dump(entry, file, ensure_ascii=False)
                file.write('\n')


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(DouglasspiderPySpider)
    process.start()
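
What I think is missing is a separate callback per variant link that reads both prices while exactly that variant is selected. This is only a sketch of what I mean, to be added as extra methods on the spider above: the new-price selector is the one I already use, but '.product-price__original::text' for the crossed-out old price and the helper name are just placeholders I have not verified against the page.

    def yield_variant_requests(self, variant_links):
        # hypothetical helper: would be yielded from parse_product_details
        # (yield from self.yield_variant_requests(variant_links)) so that every
        # variant link is actually requested instead of only written to the file
        for variant_link in variant_links:
            yield SplashRequest(url=variant_link,
                                callback=self.parse_variant_prices,
                                args={'wait': 2},
                                meta={'variant_link': variant_link})

    def parse_variant_prices(self, response):
        # current (possibly reduced) price - same selector as in parse_product_details
        new_price = response.css('.product-price__price::text').get()
        # crossed-out original price - placeholder selector, not verified
        old_price = response.css('.product-price__original::text').get()
        yield {
            'link': response.meta.get('variant_link', response.url),
            'item_No': parse_qs(urlparse(response.url).query).get('variant', [None])[0],
            'old_price': old_price.strip() if old_price else None,
            'new_price': new_price.strip() if new_price else None,
        }

Is this the right direction, or is there a better way to tie the old price and new price to the correct weight/color line for both page layouts?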

