How do I resume where I left off when web scraping with Python?


I'm new to web scraping.

So far, the data downloads to my computer fine, but because we're downloading so much of it, the run is sometimes interrupted by a network error or by my computer shutting off.

I tried adding a checkpoint file, but the script never restarts from where it stopped. Instead, it just goes back to the beginning.

Is there any way to ensure that Python resumes downloading where it left off? I've tried everything, and I can't seem to find the answer.
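
To make it clearer what I mean by "resume", here is roughly the behaviour I'm after as a simplified sketch (this is not my real code; the checkpoint filename, the items list and download_item are just placeholders): after each item finishes, record its index in a small file, and on the next run read that file back and skip everything up to that index.

import os

CHECKPOINT_FILE = 'checkpoint.txt'   # placeholder filename

def load_checkpoint():
    # Return the index of the last item that finished, or -1 if there is no checkpoint yet
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE) as f:
            try:
                return int(f.read().strip())
            except ValueError:
                return -1
    return -1

def save_checkpoint(index):
    # Overwrite the checkpoint with the index of the item that just finished
    with open(CHECKPOINT_FILE, 'w') as f:
        f.write(str(index))

def download_item(item):
    # Placeholder for the real download step
    print(f"downloading {item}")

items = ['page1', 'page2', 'page3']   # placeholder list of things to download

last_done = load_checkpoint()
for index, item in enumerate(items):
    if index <= last_done:
        continue                      # already downloaded on a previous run
    download_item(item)
    save_checkpoint(index)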

Here is the code:

import requests
from bs4 import BeautifulSoup
import os

def save_table_data(table, filename):
    rows = table.find_all('tr')
    with open(filename, 'w') as file:
        for row in rows:
            cells = row.find_all(['th', 'td'])
            cell_data = [cell.get_text(strip=True) for cell in cells]
            file.write('\t'.join(cell_data) + '\n')

def download_tables(url, path, checkpoint=None):
    try:
        response = session.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while accessing {url}: {str(e)}")
        return

    linked_page_soup = BeautifulSoup(response.content, 'html.parser')
    linked_tables = linked_page_soup.find_all('div', {'class': 'table-responsive'}) + \
                    linked_page_soup.find_all('table', {'class': 'table table-striped table-bordered'})

    for index, linked_table in enumerate(linked_tables):
        if checkpoint and index < checkpoint[0]:
            continue

        filename = os.path.join(path, f'data_{index + 1}.txt')
        save_table_data(linked_table, filename)
        print(f"{filename} downloaded successfully.")

        # Extract URLs from the table rows
        rows = linked_table.find_all('tr')
        urls = []
        for row in rows:
            link = row.find('a', href=True)
            if link:
                urls.append(link['href'])

        for sub_index, sub_url in enumerate(urls):
            if checkpoint and index == checkpoint[0] and sub_index < checkpoint[1]:
                continue

            sub_path = os.path.join(path, f'sub_{index + 1}_{sub_index + 1}')
            os.makedirs(sub_path, exist_ok=True)
            download_tables(sub_url, sub_path, checkpoint=(index, sub_index))

def save_checkpoint(checkpoint):
    with open(checkpoint_file, 'w') as file:
        if checkpoint:
            file.write(checkpoint)
        else:
            file.write("0,0")

# Replace with your actual credentials
username = 'ministry'
password = 'ministry@2022'

# Set up the session
session = requests.Session()

# Log in to the website
login_url = 'https://tricorniotec.com/webapp/portal'
login_data = {
    'username': username,
    'password': password
}
try:
    response = session.post(login_url, data=login_data)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"Error occurred during login: {str(e)}")
    exit()

# Access the dashboard
dashboard_url = 'https://tricorniotec.com/webapp/ministry/dashboard'
try:
    response = session.get(dashboard_url)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"Error occurred while accessing the dashboard: {str(e)}")
    exit()

# Parse the content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find the table with the class "table-responsive"
table = soup.find('div', {'class': 'table-responsive'})

# Extract all table rows
rows = table.find_all('tr')

# Extract URLs from the table rows
urls = []
for row in rows:
    link = row.find('a', href=True)
    if link:
        urls.append(link['href'])

# Check if there is a checkpoint file
checkpoint_file = '/path/to/your/checkpoint_file.txt'
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, 'r') as file:
        try:
            checkpoint = file.read().strip()
        except ValueError:
            print("Invalid checkpoint file format. Starting from the beginning.")
            checkpoint = None
else:
    checkpoint = None

# Set the flag to indicate if the checkpoint has been reached
reached_checkpoint = False

# Visit each URL and download the table data
for index, url in enumerate(urls):
    path = f'data_1/sub_1_3/sub_1_37/sub_1_95/data_21'
    os.makedirs(path, exist_ok=True)
    
    if checkpoint and not reached_checkpoint:
        if checkpoint == path:
            reached_checkpoint = True
        continue

    download_tables(url, path)

    if reached_checkpoint:
        # Save the checkpoint after each set of subfiles is downloaded
        save_checkpoint(path)

# Delete the checkpoint file after successful completion
if reached_checkpoint and os.path.exists(checkpoint_file):
    os.remove(checkpoint_file)

print("Download completed.")

Thank you so much!

I tried using a checkpoint file, but it doesn't resume where it left off.
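
As far as I can tell, my save_checkpoint(path) call just writes the directory path string into the checkpoint file, so after an interrupted run the file would contain something like

data_1/sub_1_3/sub_1_37/sub_1_95/data_21

while download_tables expects its checkpoint argument to be an (index, sub_index) pair, so maybe the two sides don't match up? I'm not sure if that's the cause.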
