How to Scrape Product Pages using Python grequests and BeautifulSoup

256 Views
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import grequests
import pandas as pd
    
# STEP 1: Create List of URLs from main archive page
def get_urls(first_page=1, last_page=2):
    """Build the list of catalogue archive-page URLs to scrape.

    Args:
        first_page: First catalogue page number, inclusive. Defaults to 1.
        last_page: Last catalogue page number, inclusive. Defaults to 2,
            matching the original hard-coded range(1, 3).

    Returns:
        List of URLs like 'http://books.toscrape.com/catalogue/page-N.html'.
    """
    urls = []
    for page in range(first_page, last_page + 1):
        urls.append(f'http://books.toscrape.com/catalogue/page-{page}.html')
        print(f'Getting page url: {page}', urls)
    return urls

# STEP 2: Async Load HTML Content from page range in step 1
def get_data(urls):
    """Fetch every archive page concurrently and return the responses.

    Args:
        urls: Iterable of page URLs produced in step 1.

    Returns:
        List of response objects, one per URL, in the same order as `urls`.
    """
    pending = []
    for page_url in urls:
        pending.append(grequests.get(page_url))
    print('AsyncRequest object > reqs:', pending)
    responses = grequests.map(pending)
    print('Status Code > resp (info on page):', responses, '\n')
    return responses

# Step 3: Extract title, price, url, thumb from the async variable resp containing html elements of all scraped pages.
def parse(resp):
    """Extract one record per book from each downloaded archive page.

    Args:
        resp: Iterable of HTTP responses whose ``.text`` holds the HTML of
            a catalogue archive page.

    Returns:
        List of dicts with keys: 'title', 'price', 'single_url', 'thumbnail'.
    """
    # Relative hrefs/srcs on catalogue pages resolve against this base.
    base = 'https://books.toscrape.com/catalogue/'
    productlist = []

    for r in resp:
        sp = BeautifulSoup(r.text, 'lxml')
        items = sp.find_all('article', {'class': 'product_pod'})

        for item in items:
            product = {
            # BUG FIX: the <h3> text is truncated with an ellipsis on this
            # site; the anchor's `title` attribute carries the full title.
            'title' : item.find('h3').find('a').attrs['title'],
            'price': item.find('p', {'class': 'price_color'}).text.strip(),
            # urljoin collapses the '../' segments present in the relative
            # paths; plain string concatenation produced malformed URLs.
            'single_url': urljoin(base, item.find('a').attrs['href']),
            'thumbnail': urljoin(base, item.find('img', {'class': 'thumbnail'}).attrs['src']),
            }
            productlist.append(product)
            print('Added: ', product)

    return productlist

# Pipeline: collect archive URLs, download them, extract records, write CSV.
urls = get_urls()            # Step 1: archive-page URLs
resp = get_data(urls)        # Step 2: asynchronous download
records = parse(resp)        # Step 3: per-book field extraction
df = pd.DataFrame(records)
df.to_csv('books.csv', index=False)

The above script works as expected by asynchronously scraping the main archive page or pages for the website https://books.toscrape.com/ using grequests and BeautifulSoup.

Within the archive page it extracts the following book information:

  • title
  • price
  • single product url
  • thumbnail url

Issue

I need a way to further extract information from the single product pages for information such as UPC and associate the information back to the main array productlist.

Single Product Page Example: https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html

1

There is 1 solution below.

6
Arslan Aziz On

The single product page contains the information you need — UPC, product type, reviews, etc.:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Browser-like User-Agent so the request is not served a bot response.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}
# BUG FIX: `headers` was defined but never passed to requests.get.
r = requests.get(
    "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html?",
    headers=headers,
)
soup = BeautifulSoup(r.content, "lxml")
# The product-information table lives inside <article class="product_page">.
table = soup.find("article", class_="product_page")

# Drop the first <th> of the first row, then label the value column 'S.No'.
header = [th.get_text(strip=True) for th in table.tr.select("th")][1:]
header.insert(0, 'S.No')

# Every <tr> that contains a <td> holds one product attribute
# (UPC, product type, prices, availability, review count).
all_data = []
for row in table.select("tr:has(td)"):
    tds = [td.get_text(strip=True) for td in row.select("td")]
    all_data.append(tds)

df = pd.DataFrame(all_data, columns=header)
print(df)

output:
                      S.No
0         a897fe39b1053632
1                    Books
2                   £51.77
3                   £51.77
4                    £0.00
5  In stock (22 available)
6                        0