How do I resume where I left off when web scraping with Python?


I'm new to web scraping.

So far, the data downloads to my computer fine, but because we're downloading so much of it, the run is sometimes interrupted by a network error or by my computer shutting off.

I tried adding a checkpoint file, but the script never restarts from where it stopped. Instead, it just goes back to the beginning.

Is there any way to ensure that Python resumes downloading where it left off? I've tried everything, and I can't seem to find the answer.
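
To make it clearer what I mean by "resume", here is roughly the behaviour I'm after as a simplified sketch (this is not my real code; the checkpoint filename, the items list and download_item are just placeholders): after each item finishes, record its index in a small file, and on the next run read that file back and skip everything up to that index.

import os

CHECKPOINT_FILE = 'checkpoint.txt'   # placeholder filename

def load_checkpoint():
    # Return the index of the last item that finished, or -1 if there is no checkpoint yet
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE) as f:
            try:
                return int(f.read().strip())
            except ValueError:
                return -1
    return -1

def save_checkpoint(index):
    # Overwrite the checkpoint with the index of the item that just finished
    with open(CHECKPOINT_FILE, 'w') as f:
        f.write(str(index))

def download_item(item):
    # Placeholder for the real download step
    print(f"downloading {item}")

items = ['page1', 'page2', 'page3']   # placeholder list of things to download

last_done = load_checkpoint()
for index, item in enumerate(items):
    if index <= last_done:
        continue                      # already downloaded on a previous run
    download_item(item)
    save_checkpoint(index)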

Here is the code:

import requests
from bs4 import BeautifulSoup
import os

def save_table_data(table, filename):
    rows = table.find_all('tr')
    with open(filename, 'w') as file:
        for row in rows:
            cells = row.find_all(['th', 'td'])
            cell_data = [cell.get_text(strip=True) for cell in cells]
            file.write('\t'.join(cell_data) + '\n')

def download_tables(url, path, checkpoint=None):
    try:
        response = session.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while accessing {url}: {str(e)}")
        return

    linked_page_soup = BeautifulSoup(response.content, 'html.parser')
    linked_tables = linked_page_soup.find_all('div', {'class': 'table-responsive'}) + \
                    linked_page_soup.find_all('table', {'class': 'table table-striped table-bordered'})

    for index, linked_table in enumerate(linked_tables):
        if checkpoint and index < checkpoint[0]:
            continue

        filename = os.path.join(path, f'data_{index + 1}.txt')
        save_table_data(linked_table, filename)
        print(f"{filename} downloaded successfully.")

        # Extract URLs from the table rows
        rows = linked_table.find_all('tr')
        urls = []
        for row in rows:
            link = row.find('a', href=True)
            if link:
                urls.append(link['href'])

        for sub_index, sub_url in enumerate(urls):
            if checkpoint and index == checkpoint[0] and sub_index < checkpoint[1]:
                continue

            sub_path = os.path.join(path, f'sub_{index + 1}_{sub_index + 1}')
            os.makedirs(sub_path, exist_ok=True)
            download_tables(sub_url, sub_path, checkpoint=(index, sub_index))

def save_checkpoint(checkpoint):
    with open(checkpoint_file, 'w') as file:
        if checkpoint:
            file.write(checkpoint)
        else:
            file.write("0,0")

# Replace with your actual credentials
username = 'ministry'
password = 'ministry@2022'

# Set up the session
session = requests.Session()

# Log in to the website
login_url = 'https://tricorniotec.com/webapp/portal'
login_data = {
    'username': username,
    'password': password
}
try:
    response = session.post(login_url, data=login_data)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"Error occurred during login: {str(e)}")
    exit()

# Access the dashboard
dashboard_url = 'https://tricorniotec.com/webapp/ministry/dashboard'
try:
    response = session.get(dashboard_url)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"Error occurred while accessing the dashboard: {str(e)}")
    exit()

# Parse the content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find the table with the class "table-responsive"
table = soup.find('div', {'class': 'table-responsive'})

# Extract all table rows
rows = table.find_all('tr')

# Extract URLs from the table rows
urls = []
for row in rows:
    link = row.find('a', href=True)
    if link:
        urls.append(link['href'])

# Check if there is a checkpoint file
checkpoint_file = '/path/to/your/checkpoint_file.txt'
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, 'r') as file:
        try:
            checkpoint = file.read().strip()
        except ValueError:
            print("Invalid checkpoint file format. Starting from the beginning.")
            checkpoint = None
else:
    checkpoint = None

# Set the flag to indicate if the checkpoint has been reached
reached_checkpoint = False

# Visit each URL and download the table data
for index, url in enumerate(urls):
    path = f'data_1/sub_1_3/sub_1_37/sub_1_95/data_21'
    os.makedirs(path, exist_ok=True)
    
    if checkpoint and not reached_checkpoint:
        if checkpoint == path:
            reached_checkpoint = True
        continue

    download_tables(url, path)

    if reached_checkpoint:
        # Save the checkpoint after each set of subfiles is downloaded
        save_checkpoint(path)

# Delete the checkpoint file after successful completion
if reached_checkpoint and os.path.exists(checkpoint_file):
    os.remove(checkpoint_file)

print("Download completed.")

Thank you so much!

I tried using a checkpoint file, but it doesn't resume where it left off.
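
As far as I can tell, my save_checkpoint(path) call just writes the directory path string into the checkpoint file, so after an interrupted run the file would contain something like

data_1/sub_1_3/sub_1_37/sub_1_95/data_21

while download_tables expects its checkpoint argument to be an (index, sub_index) pair, so maybe the two sides don't match up? I'm not sure if that's the cause.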
