Using HtmlUnit to load dynamic web apps

56 Views Asked by At

Pardon the long code, but I have been trying to make a web page loader class that can load both static and dynamic pages. The code below works fine with mostly static web pages, but it won't load SPA web apps (the CSS selector always fails), such as apps that run on Vue, React, GWT, etc. It also fails on websites that are gated by authentication (hence the HttpHeadersSpec data structure).

What could be wrong in this code:

/**
 * Loads a web page with HtmlUnit and, optionally, waits until a CSS selector
 * matches at least one element before taking a snapshot of the DOM.
 *
 * <p>A small static pool of {@link WebClient} instances is shared by all
 * loaders; a call to {@link #loadWebPageAfterCSSSelectorIsReady()} borrows one
 * client for the duration of the load and returns it afterwards.
 *
 * <p>NOTE(review): pooled clients keep their cookie store between loads, so
 * cookies applied for one target remain on the client when it is reused for
 * another — confirm whether that is intended.
 */
public class WebPageLoader {
    private static final int WEB_CLIENT_POOL_SIZE = 3;
    private static final int JAVASCRIPT_WAIT_TIME = 10000;
    private static final int MAX_ATTEMPTS = 3;

    private final String targetUrl;
    private final String cssSelector;
    private final int maxAttempts;
    private final HttpHeadersSpec httpHeadersSpec;
    private Consumer<String> whenMaxAttemptsFailed;
    private static final BlockingQueue<WebClient> webClients = new LinkedBlockingQueue<>();

    static {
        // Fill the shared pool once, when the class is first loaded.
        for (int i = 0; i < WEB_CLIENT_POOL_SIZE; i++) {
            webClients.add(createWebClient());
        }
    }

    /** Creates a loader with no selector, the default attempt count and no extra headers. */
    public WebPageLoader(String targetUrl) {
        this(targetUrl, null, MAX_ATTEMPTS, null);
    }

    /** Creates a loader that waits for {@code cssSelector}, using the default attempt count. */
    public WebPageLoader(String targetUrl, String cssSelector) {
        this(targetUrl, cssSelector, MAX_ATTEMPTS, null);
    }

    /** Creates a loader that waits for {@code cssSelector} and applies {@code httpHeadersSpec}. */
    public WebPageLoader(String targetUrl, String cssSelector, HttpHeadersSpec httpHeadersSpec) {
        this(targetUrl, cssSelector, MAX_ATTEMPTS, httpHeadersSpec);
    }

    /**
     * Creates a fully configured loader.
     *
     * @param targetUrl       URL to load; must not be {@code null}
     * @param cssSelector     selector to wait for, or {@code null} to skip waiting
     * @param maxAttempts     how many times the selector is checked before giving up;
     *                        values below 1 are treated as 1
     * @param httpHeadersSpec cookies / local-storage items to apply, or {@code null} for none
     * @throws NullPointerException if {@code targetUrl} is {@code null}
     */
    public WebPageLoader(String targetUrl, String cssSelector, int maxAttempts, HttpHeadersSpec httpHeadersSpec) {
        this.targetUrl = Objects.requireNonNull(targetUrl, "Target URL cannot be null");
        this.cssSelector = cssSelector;
        this.httpHeadersSpec = httpHeadersSpec != null ? httpHeadersSpec : new HttpHeadersSpec();
        // Honour the caller's value (it used to be ignored); always check at least once.
        this.maxAttempts = Math.max(1, maxAttempts);
    }

    /**
     * Registers a callback that receives the CSS selector when it still has not
     * matched after all attempts.
     */
    public void whenMaxAttemptsFailed(Consumer<String> whenMaxAttemptsFailed) {
        this.whenMaxAttemptsFailed = whenMaxAttemptsFailed;
    }

    /**
     * Loads the target URL, waits for the configured selector (if any) and
     * returns a snapshot of the resulting page.
     *
     * @return the page markup and HTTP status wrapped in a {@code Webpage}
     * @throws InterruptedException if interrupted while waiting for a pooled client or for the selector
     * @throws IOException          if loading the page fails
     */
    public Webpage loadWebPageAfterCSSSelectorIsReady() throws InterruptedException, IOException {
        WebClient webClient = getAvailableWebClient();
        try {
            HtmlPage page = prepareAndLoadPage(webClient);
            if (cssSelector != null) {
                waitForSelector(page);
            }
            return createWebpage(page);
        } finally {
            // Always hand the client back, even when loading failed.
            releaseWebClient(webClient);
        }
    }

    /**
     * Applies cookies, loads the target URL, seeds local storage and — only if
     * local storage was seeded — loads the URL again so the application can
     * pick the values up.
     *
     * @return the most recently loaded page, i.e. the one whose DOM is live
     * @throws IOException if a page load fails
     */
    protected HtmlPage prepareAndLoadPage(WebClient webClient) throws IOException {
        long startTime = System.currentTimeMillis();
        String domain = LinkProcessor.getDomainName(targetUrl);

        if (httpHeadersSpec != null) {
            applyHttpHeaders(webClient, domain, httpHeadersSpec);
        }

        HtmlPage page = webClient.getPage(targetUrl);

        boolean storageSeeded = false;
        if (httpHeadersSpec != null && httpHeadersSpec.getLocal_storage() != null) {
            for (StorageItemSpec item : httpHeadersSpec.getLocal_storage()) {
                // Escape key and value so quotes/backslashes cannot break the script.
                page.executeJavaScript(
                        "localStorage.setItem('" + escapeForJsString(String.valueOf(item.getKey()))
                                + "', '" + escapeForJsString(String.valueOf(item.getValue())) + "');");
                storageSeeded = true;
            }
        }

        if (storageSeeded) {
            // Reload so the application starts with the seeded local storage.
            // The previous code discarded the result of page.refresh() and kept
            // using the stale pre-refresh page; keep the newly loaded page instead.
            page = webClient.getPage(targetUrl);
        }
        webClient.waitForBackgroundJavaScriptStartingBefore(JAVASCRIPT_WAIT_TIME);

        long endTime = System.currentTimeMillis();
        long executionTime = endTime - startTime;
        System.out.println("Page execution time: " + executionTime + " ms");

        return page;
    }

    /**
     * Checks up to {@code maxAttempts} times whether the selector matches,
     * pausing {@code JAVASCRIPT_WAIT_TIME} ms between checks. Invokes the
     * failure callback (if registered) when no check succeeds.
     *
     * @throws InterruptedException if interrupted while pausing between checks
     */
    protected void waitForSelector(HtmlPage page) throws InterruptedException {
        if (cssSelector == null) {
            return;
        }
        for (int attempt = 1; attempt <= maxAttempts; attempt++) {
            if (!page.querySelectorAll(cssSelector).isEmpty()) {
                return;
            }
            // No point in pausing after the final check: nothing re-checks afterwards.
            if (attempt < maxAttempts) {
                synchronized (page) {
                    page.wait(JAVASCRIPT_WAIT_TIME);
                }
            }
        }
        if (whenMaxAttemptsFailed != null) {
            whenMaxAttemptsFailed.accept(cssSelector);
        }
    }

    /** Returns the CSS selector this loader waits for, or {@code null} if none. */
    protected String getCssSelector() {
        return this.cssSelector;
    }

    /** Builds the result object from the page's XML serialization and HTTP status code. */
    private Webpage createWebpage(HtmlPage page) {
        int statusCode = page.getWebResponse().getStatusCode();
        Webpage webPage = new Webpage(page.asXml(), statusCode);
        webPage.setUrl(targetUrl);
        return webPage;
    }

    /**
     * Borrows a web client from the pool, blocking until one is available.
     *
     * @throws InterruptedException if interrupted while waiting
     */
    protected WebClient getAvailableWebClient() throws InterruptedException {
        // BlockingQueue.take() already blocks and is thread-safe; the previous
        // manual wait() loop swallowed interrupts and could spin forever.
        return webClients.take();
    }

    /** Returns a web client to the pool so another load can reuse it. */
    protected void releaseWebClient(WebClient webClient) {
        // The queue is unbounded and thread-safe, so no extra locking is needed.
        webClients.add(webClient);
    }

    /** Adds the cookies from {@code httpHeadersSpec} (if any) to the client's cookie manager. */
    private static void applyHttpHeaders(WebClient webClient, String domain, HttpHeadersSpec httpHeadersSpec) {
        if (httpHeadersSpec.getCookies() != null) {
            httpHeadersSpec.getCookies().forEach(cookieSpec ->
                    webClient.getCookieManager().addCookie(new Cookie(domain, cookieSpec.getKey(), cookieSpec.getValue())));
        }
    }

    /** Escapes {@code raw} for safe embedding inside a single-quoted JavaScript string literal. */
    private static String escapeForJsString(String raw) {
        StringBuilder escaped = new StringBuilder(raw.length() + 8);
        for (int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            switch (c) {
                case '\\':
                    escaped.append("\\\\");
                    break;
                case '\'':
                    escaped.append("\\'");
                    break;
                case '\n':
                    escaped.append("\\n");
                    break;
                case '\r':
                    escaped.append("\\r");
                    break;
                case '\u2028':
                    escaped.append("\\u2028");
                    break;
                case '\u2029':
                    escaped.append("\\u2029");
                    break;
                default:
                    escaped.append(c);
            }
        }
        return escaped.toString();
    }

    /** Creates a JavaScript- and CSS-enabled web client that does not throw on script or HTTP errors. */
    private static WebClient createWebClient() {
        WebClient webClient = new WebClient(BrowserVersion.BEST_SUPPORTED);
        webClient.getOptions().setUseInsecureSSL(false);
        webClient.getOptions().setAppletEnabled(false);
        webClient.getOptions().setDownloadImages(false);
        webClient.getOptions().setPopupBlockerEnabled(true);
        webClient.getOptions().setRedirectEnabled(true);
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setPrintContentOnFailingStatusCode(false);
        webClient.setJavaScriptErrorListener(new SilentJavaScriptErrorListener());
        webClient.getOptions().setCssEnabled(true);
        webClient.setCssErrorHandler(new SilentCssErrorHandler());
        webClient.getCookieManager().setCookiesEnabled(true);
        return webClient;
    }
}
1

There is 1 best solution below

3
RBRi On

There are still some problems with these applications, mainly because of some missing DOM features (e.g. ShadowDOM) or some problems with the JavaScript support. Both areas are constantly improving, but we have to look at this on a case-by-case basis. If you are able to open an issue pointing to a missing or misbehaving function (and maybe provide a small test case), there is a good chance to get this fixed/improved.

Outside of these general problems, it might be a good idea to have a look at the HtmlUnitBrowser class from the Wetator (www.wetator.org) project. It has many generic implementations you might find useful for getting ideas on how to handle common problems.

https://github.com/Wetator/wetator/blob/master/src/main/java/org/wetator/backend/htmlunit/HtmlUnitBrowser.java

E.g., have a look at waitForImmediateJobs() or getCurrentPage().