fs.writeFile crashes Node app after writing first JSON file


I'm trying to crawl several web pages to check for broken links and write the results for each page to a JSON file. However, after the first file is written, the app crashes with no error popping up...

I'm using Puppeteer to crawl, Bluebird to process the links concurrently, and fs to write the files.

WHAT I'VE TRIED:

  • switching the file type to '.txt' or '.php'; this works, but I'd need another loop outside the current workflow to convert the files back to '.json'. Renaming the file right after writing to it also causes the app to crash.
  • using try/catch statements around fs.writeFile, but it never throws an error (see the sketch after this list)
  • running the entire app outside of Express; this worked at some point, but I'm trying to use it within the framework
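
For what it's worth, here is a minimal sketch of why the try/catch attempt can't work (the ./tmp/out.json path is made up for illustration): the callback form of fs.writeFile reports errors asynchronously through its callback, so a surrounding try/catch only ever sees synchronous throws. Awaiting the promise form does surface the error where try/catch can catch it:

const fs = require('fs');

// Dead code for write errors: fs.writeFile hands failures to its
// callback instead of throwing synchronously.
try {
    fs.writeFile('./tmp/out.json', '{}', (err) => {
        if (err) console.error('write failed:', err); // errors land here
    });
} catch (err) {
    console.error('never reached for write errors:', err);
}

// Awaiting fs.promises.writeFile turns a rejection into a catchable throw.
(async () => {
    try {
        await fs.promises.writeFile('./tmp/out.json', '{}');
    } catch (err) {
        console.error('caught write error:', err);
    }
})();

Here's the full route: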
const express = require('express');
const router = express.Router();
const puppeteer = require('puppeteer');
const bluebird = require("bluebird");
const fs = require('fs');

router.get('/', function(req, res, next) {
    (async () => {
        // Our (multiple) URLs.
        const urls = ['https://www.testing.com/allergy-test/', 'https://www.testing.com/genetic-testing/'];
    
        const withBrowser = async (fn) => {
            const browser = await puppeteer.launch();
            try {
                return await fn(browser);
            } finally {
                await browser.close();
            }
        }
    
        const withPage = (browser) => async (fn) => {
            const page = await browser.newPage();

            // Turns request interceptor on.
            await page.setRequestInterception(true);

            // Ignore all the asset requests, just get the document.
            page.on('request', request => {
                if (request.resourceType() === 'document') {
                    request.continue();
                } else {
                    request.abort();
                }
            });

            try {
                return await fn(page);
            } finally {
                await page.close();
            }
        }
    
        const results = await withBrowser(async (browser) => {
            return bluebird.map(urls, async (url) => {
                return withPage(browser)(async (page) => {                    
                    await page.goto(url, {
                        waitUntil: 'domcontentloaded',
                        timeout: 0 // Removes timeout.
                    });
    
                    // Search for urls we want to "crawl".
                    const hrefs = await page.$$eval('a[href^="https://www.testing.com/"]', as => as.map(a => a.href));

                    // Predefine our array of results.
                    const links = [];

                    // Loops through each target url on the page.
                    for (const href of hrefs) {
                        const response = await page.goto(href, {
                            waitUntil: 'domcontentloaded',
                            timeout: 0 // Remove timeout.
                        });
                        const chain = response.request().redirectChain();

                        // Collect every hop in the redirect chain for this href.
                        const redirects = chain.map((ch) => ({
                            status: ch.response().status(),
                            url: ch.url(),
                        }));

                        // Push all info of the target link into links.
                        links.push({
                            'source_url': href,
                            'status': response.status(),
                            'final_url': response.url(),
                            'redirect_count': chain.length,
                            'redirects': redirects,
                        });
                    }
                    // JSONify the data.
                    const linksJson = JSON.stringify(links);

                    // Build a filesystem-safe file name from the url slug.
                    let fileName = url.replace('https://www.testing.com/', '');
                    fileName = fileName.replace(/[^a-zA-Z0-9\-]/g, '');

                    // Write data to file in the ./tmp directory.
                    fs.writeFile(`./tmp/${fileName}.json`, linksJson, (err) => {
                        if (err) {
                            return console.log(err);
                        }
                    });

                    return links;
                });
            }, {concurrency: 4}); // How many pages to run at a time.
        });

        // Respond with the crawl results so the request doesn't hang.
        res.json(results);
    })().catch(next); // Forward any unexpected error to Express.
});
module.exports = router;

UPDATE: So there was nothing wrong with my code... I realized nodemon was stopping the process after each file was saved. Since nodemon detected the new .json file as a "file change", it kept restarting my server after the first item was written. The fix is to tell nodemon to ignore the output directory, as sketched below.
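
As a sketch of that fix (assuming the output directory is ./tmp, as in the code above), a nodemon.json in the project root tells the watcher to skip the generated files:

{
  "ignore": ["tmp/*"]
}

The same can be done on the command line with nodemon --ignore tmp/.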
