I have included steps to close the Selenium Python webdriver, and even to restart it regularly to avoid accumulating problems. Yet Windows 10 Task Manager shows a steady increase in memory and CPU usage, up to the point of saturation and freezing my computer, and the number of Chrome instances keeps growing despite my steps to close the webdriver regularly:
import csv
import json
import os
import time
from selenium import webdriver
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import logging
from selenium.webdriver.chrome.options import Options
# --- Logging: timestamped file per run, mirrored to the console ---
current_time = time.strftime("%Y%m%d_%H%M%S")
_log_handlers = [
    logging.FileHandler(f"log_{current_time}.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
    handlers=_log_handlers,
)
logger = logging.getLogger()

# Shared webdriver handle managed by create_driver()/close_driver().
global_driver = None
def close_driver():
    """Quit the shared webdriver instance, if one exists, and reset the handle."""
    global global_driver
    if not global_driver:
        return
    try:
        global_driver.quit()
    except Exception as e:
        logger.error(f"Error closing the driver: {e}")
    global_driver = None
def create_driver():
    """Quit any existing shared driver, then start a fresh headless Chrome.

    Returns the new driver instance, or None when startup fails.
    """
    global global_driver
    close_driver()  # never leave a previous instance running
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # run without a visible window
        global_driver = uc.Chrome(options=chrome_options)
    except Exception as e:
        logger.error(f"Error creating new driver instance: {e}")
        return None
    return global_driver
# Global setting for scraping quantity (1 or 2)
SCRAPE_QTY = 2  # Set to 2 if scraping for 2 qty values

# One representative location per shipping zone, as
# (group_name, suburb, state, postcode) tuples. group_name is the label
# written to the output CSVs; the postcode (index 3) is what gets typed
# into the site's shipping estimator.
representative_postcodes = [
    ("SYDMET", "EASTGARDENS", "NSW", "2036"),
    ("NSWTWN", "BAY VILLAGE", "NSW", "2261"),
    ("NSWREG", "CATTLE CREEK", "NSW", "2339"),
    ("QLDTWN", "BOTTLE CREEK", "QLD", "2469"),
    ("MELMET", "CROSS KEYS", "VIC", "3041"),
    ("VICTWN", "BELL PARK", "VIC", "3215"),
    ("VICREG", "TERANG", "VIC", "3264"),
    ("BRIMET", "ASPLEY", "QLD", "4034"),
    ("QLDREG", "CARPENDALE", "QLD", "4344"),
    ("ADEMET", "OAKLANDS PARK", "SA", "5046"),
    ("SAREG", "CAPE JERVIS", "SA", "5204"),
    ("PERMET", "KARAKIN", "WA", "6044"),
    ("WAREG", "BALBARRUP", "WA", "6258"),
    ("TAZTWN", "CAPE PILLAR", "TAS", "7182"),
    ("TASMAN", "BLACK HILLS", "TAS", "7140"),
    ("DARMET", "ANULA", "NT", "0812"),
    ("NTREG", "ALICE SPRINGS", "NT", "0870")
]
def read_source_csv(file_path):
    """Load the source CSV as a list of dict rows; empty list on any failure."""
    try:
        with open(file_path, newline='', encoding='utf-8') as fh:
            return [row for row in csv.DictReader(fh)]
    except Exception as e:
        logger.error(f"Error reading CSV file: {e}")
        return []
def append_to_csv(file_name, data):
    """Append one result row, writing the header line first when the file is new."""
    try:
        is_new = not os.path.isfile(file_name)
        with open(file_name, 'a', newline='', encoding='utf-8') as fh:
            out = csv.writer(fh)
            if is_new:
                out.writerow(['sku', 'postcode_group', 'suburb', 'state', 'postcode', 'rate'])
            out.writerow(data)
    except Exception as e:
        logger.error(f"Error writing to CSV file: {e}")
def save_last_processed(sku, postcode_group):
    """Persist the resume checkpoint so a restart can pick up after this SKU/group."""
    checkpoint = {'last_processed_sku': sku, 'last_processed_postcode_group': postcode_group}
    try:
        with open('last_processed.json', 'w') as fh:
            json.dump(checkpoint, fh)
    except Exception as e:
        logger.error(f"Error saving last processed record: {e}")
def load_last_processed():
    """Read the resume checkpoint; fall back to empty markers when absent or unreadable."""
    empty = {'last_processed_sku': None, 'last_processed_postcode_group': None}
    try:
        with open('last_processed.json', 'r') as fh:
            return json.load(fh)
    except FileNotFoundError:
        # First run: nothing processed yet.
        return empty
    except Exception as e:
        logger.error(f"Error loading last processed record: {e}")
        return empty
def scrape_shipping_rates(driver, sku, product_url, postcode_group):
    """Scrape the shipping rate for one SKU at one representative postcode.

    driver         -- live webdriver instance
    sku            -- product SKU (used for logging and CSV output only)
    product_url    -- product page to load
    postcode_group -- (group_name, suburb, state, postcode) tuple

    Each outcome is appended to a dedicated CSV file (404 page, red error
    message, scraped rate, or rate not detected).

    Returns '404 Error', 'Out of Memory', 'Error', 'Success', or None
    (None when the red shipping-estimation error message was displayed).
    """
    try:
        driver.get(product_url)
        logger.info("Page loaded.")
        # Detect the site's custom 404 page before interacting with the form.
        try:
            WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.XPATH, "//h1[text()='Whoops, our bad...']"))
            )
            logger.info(f"404 Error for {sku} at {product_url}")
            append_to_csv('404.csv', [sku, *postcode_group, '404 Error'])
            return '404 Error'
        except TimeoutException:
            # No 404 banner within 3s -- assume the product page is real.
            logger.error("No 404 Error detected. Continuing scraping.")
        # Optionally bump the quantity to 2 before requesting a rate.
        if SCRAPE_QTY == 2:
            try:
                qty_input = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "qty"))
                )
                qty_input.clear()
                qty_input.send_keys("2")
                logger.info("Quantity set to 2.")
            except TimeoutException:
                # Best-effort: continue with the default quantity.
                logger.error("Quantity input not found or loading issue.")
        # NOTE(review): responseStart is a navigation-timing timestamp, not an
        # HTTP status code, and status_code is never used afterwards.
        status_code = driver.execute_script(
            "return window.performance.getEntriesByType('navigation')[0].responseStart;"
        )
        # Type the postcode into the suburb/city autocomplete field.
        city_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "city"))
        )
        logger.info("City input found.")
        city_input.clear()  # Clear the input field
        city_input.send_keys(postcode_group[3])  # postcode is the 4th tuple element
        logger.info("Postcode entered.")
        time.sleep(2)  # give the autocomplete suggestions time to populate
        # Pick the first suggestion offered for the typed postcode.
        suggestion = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#suggetion-box ul li"))
        )
        logger.info("Suggestion box found.")
        suggestion.click()
        logger.info("Suggestion clicked.")
        # Request the shipping estimate.
        get_rate_button = driver.find_element(By.ID, "get_rate")
        get_rate_button.click()
        logger.info("Get rate button clicked.")
        # The site showed its red "cannot estimate" message -- record and bail out.
        try:
            WebDriverWait(driver, 3).until(
                EC.visibility_of_element_located((By.ID, "shipping_rate_estimation_error"))
            )
            append_to_csv('red_errormsg_returned.csv', [sku, *postcode_group, 'Error'])
            return
        except TimeoutException as e:
            # No error message appeared; a rate table is expected instead.
            logger.error(f"TimeoutException for error message check: {e}")
        # Read the rate from the results table, if it rendered.
        try:
            WebDriverWait(driver, 3).until(
                EC.visibility_of_element_located((By.ID, "result-table"))
            )
            rate_span = driver.find_element(By.CSS_SELECTOR, "#result-table .price")
            rate = rate_span.text.replace('$', '').replace(',', '')  # "$1,234.56" -> "1234.56"
            append_to_csv('fed_scraped_shipping_rates.csv', [sku, *postcode_group, rate])
        except TimeoutException as e:
            logger.error(f"TimeoutException for rate table check: {e}")
            append_to_csv('rate_not_detected.csv', [sku, *postcode_group, 'No Rate'])
    except Exception as e:
        # Renderer crashes surface as generic WebDriver exceptions; the message
        # text distinguishes out-of-memory from everything else.
        error_message = str(e)
        if "Out of Memory" in error_message or "Timed out receiving message from renderer" in error_message:
            logger.error(f"Out of Memory error processing SKU {sku} for postcode group {postcode_group[0]}")
            append_to_csv('out_of_memory_errors.csv', [sku, *postcode_group, 'Out of Memory'])
            return 'Out of Memory'
        else:
            logger.error(f"Error processing SKU {sku} for postcode group {postcode_group[0]}: {e}")
            return 'Error'
    return 'Success'
def find_starting_index(source_data, last_processed_sku):
    """Index of the first row whose SKU matches the checkpoint, or 0 if absent."""
    matches = (i for i, entry in enumerate(source_data) if entry['SKU'] == last_processed_sku)
    return next(matches, 0)
def get_postcode_group_index(postcode_group_name):
    """Position of the named group in representative_postcodes, or -1 if unknown."""
    for idx, group in enumerate(representative_postcodes):
        if group[0] == postcode_group_name:
            return idx
    return -1
def process_sku(driver, row, postcode_group):
    """Scrape one SKU/postcode-group pair and checkpoint it on success.

    Returns the status from scrape_shipping_rates ('Success', 'Out of Memory',
    'Error', '404 Error', or None). Any unexpected exception is re-raised
    after the shared driver has been shut down.
    """
    try:
        result = scrape_shipping_rates(driver, row['SKU'], row['Product URL'], postcode_group)
        if result == 'Success':
            logger.info(f"Successfully processed SKU: {row['SKU']}, Postcode Group: {postcode_group[0]}")
            save_last_processed(row['SKU'], postcode_group[0])
        elif result == 'Out of Memory':
            # Don't update last_processed so this record is retried after restart.
            logger.error(f"Out of Memory error for SKU: {row['SKU']}, Postcode Group: {postcode_group[0]}")
            return 'Out of Memory'
        else:
            logger.error(f"Error encountered for SKU: {row['SKU']}, Postcode Group: {postcode_group[0]}")
        time.sleep(1)
        return result
    except Exception:
        # Use close_driver() rather than driver.quit(): quitting the local
        # reference directly left the module-level global_driver pointing at a
        # dead session. Bare raise preserves the original traceback.
        close_driver()
        raise
def read_processed_skus(file_name):
    """Return the {(sku, postcode_group)} pairs already recorded in file_name."""
    if not os.path.isfile(file_name):
        return set()
    with open(file_name, 'r', encoding='utf-8') as fh:
        rows = csv.reader(fh)
        next(rows, None)  # drop the header line
        return {(entry[0], entry[1]) for entry in rows}
def main():
    """Drive the full scrape: every SKU x every postcode group, restarting
    Chrome after each 25 lookups (or after an out-of-memory error) so
    renderer processes cannot accumulate."""
    global global_driver
    logger.info("Starting script...")
    try:
        source_data = read_source_csv("Data feed-22.01.24.csv")
        if not source_data:
            logger.info("No data found in source file, exiting.")
            return
        # Pairs already handled in a previous run are skipped below.
        processed_skus = read_processed_skus("fed_scraped_shipping_rates.csv")
        errors = (read_processed_skus("out_of_memory_errors.csv")
                  | read_processed_skus("rate_not_detected.csv")
                  | read_processed_skus("red_errormsg_returned.csv")
                  | read_processed_skus("404.csv"))
        process_count = 0
        driver = create_driver()  # Create a new driver instance
        if driver is None:
            # Original code continued with None and crashed on first use.
            logger.error("Could not start the webdriver, exiting.")
            return
        for row in source_data:
            for postcode_group in representative_postcodes:
                key = (row['SKU'], postcode_group[0])
                if key in processed_skus or key in errors:
                    continue
                result = process_sku(driver, row, postcode_group)
                process_count += 1
                if result == 'Out of Memory' or process_count >= 25:
                    # create_driver() already quits the old instance via
                    # close_driver(); the previous explicit driver.quit() here
                    # double-quit the same session and logged spurious errors.
                    driver = create_driver()
                    if driver is None:
                        logger.error("Could not restart the webdriver, aborting.")
                        return
                    process_count = 0
    except Exception as e:
        logger.error(f"An error occurred in the main function: {e}")
    finally:
        close_driver()
        logger.info("Script finished.")
# Run the scraper only when executed as a script (not on import).
if __name__ == '__main__':
    main()
It is not related to undetected-chromedriver (UC): I tested the same script with the latest plain Selenium and observed identical behavior.