This is a work-in-progress problem, following up on my earlier work on downloading images from web pages while avoiding interference such as Cloudflare.
The method is to read each image's response data out of the Chrome DevTools network log after the page has loaded on screen.
If I keep the Chrome window in the foreground and scroll down automatically, every image downloads fine.
However, if the browser is hidden (headless, minimized, or covered by other windows), the images are never loaded or cached, so nothing gets downloaded.
Is there any way to solve this?
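Here is the code I am using: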
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import time
import subprocess
import os
import base64
import json
from bs4 import BeautifulSoup
from urllib.parse import urljoin
current_dir = os.path.dirname(os.path.realpath(__file__))
down_dir = os.path.join(current_dir, 'download')
path = "chromedriver.exe"
sp = subprocess.Popen(f'C://Program Files//Google//Chrome//Application//chrome.exe --remote-debugging-port=9222 --auto-open-devtools-for-tabs --ignore-certificate-errors')
# --headless
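# (adding --headless above reproduces the problem: the images are never loaded)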
option = Options()
option.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
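# expose the DevTools Network events through Selenium's 'performance' log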
option.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
browser = webdriver.Chrome(service=service, options=option)
# browser.minimize_window()  # once minimized (or headless), the images are no longer downloaded
# start crawling
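url = "https://example.com/list"  # placeholder: the listing page to crawl (real URL omitted here)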
browser.get(url)
html_source = browser.page_source
bs = BeautifulSoup(html_source, features="lxml")
title = bs.find("title").text
page_contents = bs.find("div", class_="listing")
def move_end(browser):
    # scroll down slowly so that lazy-loaded images are actually requested
    total_page_height = browser.execute_script("return document.body.scrollHeight")
    browser_window_height = browser.get_window_size(windowHandle='current')['height']
    current_position = browser.execute_script('return window.pageYOffset')
    while total_page_height - current_position > browser_window_height:
        browser.execute_script(f"window.scrollTo({current_position}, {browser_window_height + current_position});")
        current_position = browser.execute_script('return window.pageYOffset')
        total_page_height = browser.execute_script("return document.body.scrollHeight")
        time.sleep(0.4)  # give each batch of lazy-loaded images time to finish loading
# collect all sub pages
for td in page_contents.findAll('a'):
    page_title = td.text.strip()
    if not page_title:
        continue
    href = urljoin(url, td.attrs['href'])
    browser.get(href)
    time.sleep(2)
    # extract the image URLs from each sub page
    html_source = browser.page_source
    bs = BeautifulSoup(html_source, features="lxml")
    image_contents = bs.find("div", class_="content")
    img_urls = [img_tag['data-src'] for img_tag in image_contents.findAll('img') if img_tag.has_attr('data-src')]
    # scroll to the end of the page so the images are actually loaded
    move_end(browser)
    time.sleep(1)
    os.makedirs(os.path.join(down_dir, title, page_title), exist_ok=True)
    # read the DevTools performance log and save the matching image responses
    logs = browser.get_log("performance")
    for log in logs:
        message = log["message"]
        if "Network.responseReceived" not in message:
            continue
        params = json.loads(message)["message"].get("params")
        if not params:
            continue
        response = params.get("response")
        if response and response["url"] in img_urls:
            try:
                body = browser.execute_cdp_cmd('Network.getResponseBody', {'requestId': params["requestId"]})
                data = base64.b64decode(body['body']) if body.get('base64Encoded') else body['body'].encode()
                file_name = response["url"].split('/')[-1]
                with open(os.path.join(down_dir, title, page_title, file_name), 'wb') as img:
                    img.write(data)
                img_urls.remove(response["url"])
            except Exception:
                pass  # getResponseBody fails when the body was never cached
sp.terminate()
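For what it's worth, here is the minimal check I use to confirm the behavior (same browser and move_end as in the script above; url is the placeholder from the script). In my tests it prints a non-zero count with the window in the foreground, and 0 when the window is minimized, covered, or headless:

# minimal sketch: count image responses seen in the performance log
browser.get(url)
move_end(browser)
time.sleep(2)
image_responses = 0
for log in browser.get_log("performance"):
    msg = json.loads(log["message"])["message"]
    if msg.get("method") != "Network.responseReceived":
        continue
    if msg["params"]["response"].get("mimeType", "").startswith("image/"):
        image_responses += 1
print(image_responses)  # non-zero in the foreground; 0 when hidden/headless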