I'm new to web scraping.
So far, I've gotten the data to download onto my computer perfectly fine, but since we're downloading so much data, sometimes there is a network error or my computer shuts off.
I tried installing a checkpoint file, but it never starts from where I need it to. Instead, it just goes back to the beginning.
Is there any way to ensure that Python resumes downloading where it left off? I've tried everything, and I can't seem to find the answer.
Here is the coding sequence:
import requests
from bs4 import BeautifulSoup
import os
def save_table_data(table, filename):
rows = table.find_all('tr')
with open(filename, 'w') as file:
for row in rows:
cells = row.find_all(['th', 'td'])
cell_data = [cell.get_text(strip=True) for cell in cells]
file.write('\t'.join(cell_data) + '\n')
def download_tables(url, path, checkpoint=None):
try:
response = session.get(url)
response.raise_for_status()
except requests.exceptions.RequestException as e:
print(f"Error occurred while accessing {url}: {str(e)}")
return
linked_page_soup = BeautifulSoup(response.content, 'html.parser')
linked_tables = linked_page_soup.find_all('div', {'class': 'table-responsive'}) + \
linked_page_soup.find_all('table', {'class': 'table table-striped table-bordered'})
for index, linked_table in enumerate(linked_tables):
if checkpoint and index < checkpoint[0]:
continue
filename = os.path.join(path, f'data_{index + 1}.txt')
save_table_data(linked_table, filename)
print(f"{filename} downloaded successfully.")
# Extract URLs from the table rows
rows = linked_table.find_all('tr')
urls = []
for row in rows:
link = row.find('a', href=True)
if link:
urls.append(link['href'])
for sub_index, sub_url in enumerate(urls):
if checkpoint and index == checkpoint[0] and sub_index < checkpoint[1]:
continue
sub_path = os.path.join(path, f'sub_{index + 1}_{sub_index + 1}')
os.makedirs(sub_path, exist_ok=True)
download_tables(sub_url, sub_path, checkpoint=(index, sub_index))
def save_checkpoint(checkpoint):
with open(checkpoint_file, 'w') as file:
if checkpoint:
file.write(checkpoint)
else:
file.write("0,0")
# Replace with your actual credentials
username = 'ministry'
password = 'ministry@2022'
# Set up the session
session = requests.Session()
# Log in to the website
login_url = 'https://tricorniotec.com/webapp/portal'
login_data = {
'username': username,
'password': password
}
try:
response = session.post(login_url, data=login_data)
response.raise_for_status()
except requests.exceptions.RequestException as e:
print(f"Error occurred during login: {str(e)}")
exit()
# Access the dashboard
dashboard_url = 'https://tricorniotec.com/webapp/ministry/dashboard'
try:
response = session.get(dashboard_url)
response.raise_for_status()
except requests.exceptions.RequestException as e:
print(f"Error occurred while accessing the dashboard: {str(e)}")
exit()
# Parse the content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# Find the table with the class "table-responsive"
table = soup.find('div', {'class': 'table-responsive'})
# Extract all table rows
rows = table.find_all('tr')
# Extract URLs from the table rows
urls = []
for row in rows:
link = row.find('a', href=True)
if link:
urls.append(link['href'])
# Check if there is a checkpoint file
checkpoint_file = '/path/to/your/checkpoint_file.txt'
if os.path.exists(checkpoint_file):
with open(checkpoint_file, 'r') as file:
try:
checkpoint = file.read().strip()
except ValueError:
print("Invalid checkpoint file format. Starting from the beginning.")
checkpoint = None
else:
checkpoint = None
# Set the flag to indicate if the checkpoint has been reached
reached_checkpoint = False
# Visit each URL and download the table data
for index, url in enumerate(urls):
path = f'data_1/sub_1_3/sub_1_37/sub_1_95/data_21'
os.makedirs(path, exist_ok=True)
if checkpoint and not reached_checkpoint:
if checkpoint == path:
reached_checkpoint = True
continue
download_tables(url, path)
if reached_checkpoint:
# Save the checkpoint after each set of subfiles is downloaded
save_checkpoint(path)
# Delete the checkpoint file after successful completion
if reached_checkpoint and os.path.exists(checkpoint_file):
os.remove(checkpoint_file)
print("Download completed.")
Thank you so much!
I tried using a checkpoint file, but it doesn't resume where it left off.