Browse html with horseman after scraping

233 Views Asked by At

I am using Horseman to scrape a website in order to build some graphs with the extracted data.
I managed to get the root element of each main section with my code, but I don't know how to traverse the elements inside it.
What I want to do is build JSON from some of the child elements, such as:

  • company-name
  • company-stack (which contains a ul list)

This is my code so far:

/**
 * GET / — scrapes result pages 0 to 4 of the stacks listing and logs the
 * HTML returned for the `article` selector of each page.
 *
 * The response is rendered only after every page has been processed; if any
 * scrape fails, the error is handed to Express through `next`.
 *
 * @param {object} req - Express request (unused).
 * @param {object} res - Express response; renders the `index` view when done.
 * @param {function} next - Express error callback.
 */
router.get('/', function(req, res, next) {
  // Base URL of the listing; the page number is appended to the trailing `p=`.
  const url = "http://www.welcometothejungle.co/stacks?q=&hPP=30&idx=cms_companies_stacks_production&p=";

  const pages = [0,1,2,3,4];

  // One Horseman instance per page. Each chain's promise is kept so the
  // response can wait for the scraping instead of racing ahead of it.
  const scrapes = pages.map((page) => {
    const horseman = new Horseman();
    return horseman
        .open(url + page)
        .html('article')
        .then((text) => {
            console.log(`${text}`);
        })
        .close();
  });

  Promise.all(scrapes)
    .then(() => {
      res.render('index', {title :"Done"});
    })
    // Forward scraping failures to Express rather than leaving the
    // rejection unhandled.
    .catch(next);

});

How can I traverse the HTML held in the 'text' result variable?

1

There is 1 best solution below

0
Stephane Karagulmez On

I managed to parse the data using another module called cheerio. If you have a way of doing it with Horseman alone, that would be interesting!

horseman
        .open(url + '' + page)
        .html('article')
        .then((htmlRes) => {
          // No HTML came back for this page: report it and skip the parsing.
          if (!htmlRes) {
            console.log("Impossible to retrieve data");
            return;
          }

          // Hand the scraped markup to cheerio so it can be queried.
          const $ = cheerio.load(htmlRes);
          const item = {}; // everything collected from this page's markup

          // Company name: remove all whitespace and digits from the heading
          // text, then cut the last four characters (per the original
          // author, the trailing word "jobs"). If several headings match,
          // the last one wins.
          $('h4[class=company-name]').each((index, heading) => {
            const compact = $(heading).text().replace(/\s+/g, '').replace(/\d+/g, '');
            item.company = compact.slice(0, -4);
          });

          // One entry per stack category: its title plus the text of its
          // items joined into a single whitespace-free string.
          const categories = [];
          $('div[class=company-stack-category]').each((index, category) => {
            const entries = [];
            $(category)
              .children('.company-stack-list')
              .children('.stack-item')
              .each((position, entry) => {
                entries.push($(entry).text());
              });

            categories.push({
              stackName: $(category).children('.category-title').text(),
              stackValue: entries.join(', ').replace(/\s+/g, ''),
            });
          });

          item.stacks = categories;
          articles.push(item);
        })
        .close();