How to pull multiple sub-table data as a single table data in JavaScript with Cheerio

102 Views Asked by At

I want to download the whole table data from this webpage. It has five sub-tables, each with its own thead and tbody, under 'table[class="calendar"]'. My code below can pull all the thead texts and tbody texts as a whole, but it doesn't group them by sub-table.

I want to pull the thead texts and tbody texts from each sub-table and then combine them into one table, so that I get the whole table data organized in the same arrangement as the webpage shows. How can I do that?

/**
 * Scrapes the Finviz economic calendar (Google Apps Script: UrlFetchApp +
 * Cheerio) and logs two 2D arrays: the header rows of all sub-tables and
 * all body rows, each chunked into rows of `column` cells.
 */
function test() {
  const url = "https://finviz.com/calendar.ashx";
  const res = UrlFetchApp.fetch(url, {
    muteHttpExceptions: true,
  }).getContentText();
  const $ = Cheerio.load(res);
  const column = 9; // number of columns in each calendar sub-table

  const thead = $("thead")
    .find("th")
    .toArray()
    .map(el => $(el).text());
  const tableHead = [];
  while (thead.length) tableHead.push(thead.splice(0, column)); // convert 1D array to 2D array
  console.log(tableHead);

  const tcontents = $("body > div.content")
    .find("td")
    .toArray()
    .map(el => $(el).text());
  // Pad every "No economic releases" placeholder row with empty cells so it
  // lines up with `column` cells per row. Bug fixes versus the original:
  // 1) `if (idx)` was a truthiness test — `indexOf` returns -1 on a miss,
  //    which is truthy, so the padding fired when the text was ABSENT and
  //    was skipped when it sat at index 0; it must be `idx > -1`.
  // 2) Only the first occurrence was padded, but each empty sub-table emits
  //    its own placeholder, so loop over all occurrences.
  let idx = tcontents.indexOf("No economic releases");
  while (idx > -1) {
    tcontents.splice(idx + 1, 0, "", "", "", "", "", ""); // add empty elements to match the number of table columns
    idx = tcontents.indexOf("No economic releases", idx + 1);
  }
  const tableContents = [];
  while (tcontents.length)
    tableContents.push(tcontents.splice(0, column)); // convert 1D array to 2D array
  tableContents.pop(); // remove last empty array
  console.log(tableContents);
}

My expected output is shown in the attached screenshot.

3

There are 3 best solutions below

3
Newbie On BEST ANSWER

I figured out a solution as below. I'm using Google Apps Script.

/**
 * Scrapes the five sub-tables of the Finviz economic calendar (Google Apps
 * Script: UrlFetchApp + Cheerio) and merges them into one 2D array:
 * for each sub-table, its header row followed by its data rows.
 */
function test() {

  const url = 'https://finviz.com/calendar.ashx';
  const res = UrlFetchApp.fetch(url, { muteHttpExceptions: true }).getContentText();
  const $ = Cheerio.load(res);
  const column = 9; // number of columns in each calendar sub-table
  let table = [];

  // The five calendar tables sit at nth-child positions 2, 5, 8, 11, 14.
  for (let i = 2; i < 15; i += 3) {
    const tableData = $('body > div.content > div > div > table:nth-child(' + i + ')');
    const thead = tableData.find('th').toArray().map(el => $(el).text());
    const tcontents = tableData.find('td').toArray().map(el => $(el).text());
    const idx = tcontents.indexOf('No economic releases');
    if (idx > -1) {
      tcontents.splice(idx + 1, 0, '', '', '', '', ''); // add empty elements to match the number of table columns
    }
    // NOTE: the original had `else tcontents.slice(5);` here — a no-op,
    // since slice returns a new array that was discarded. Removed.
    const tableContents = [];
    while (tcontents.length) tableContents.push(tcontents.splice(0, column)); // convert 1D array to 2D array

    // Append this sub-table's header row and data rows. (The original
    // re-declared `table` with `var` inside the loop, which only worked
    // because of hoisting; plain reassignment is the intent.)
    table = [...table, thead, ...tableContents];
  }

  console.log(table);
}
2
ggorlen On

I don't see any sub tables (tables inside of other tables) here, nor do I see 'table[class="calendar"]'. In general, avoid [class=...] syntax since it's ultra-rigid and fails if there are other classes present, or the classes are in a different order. Prefer table.calendar.

Try using nesting in your scraping to preserve the structure. while and splice are sideline tools, not usually used for much of anything; map is the primary array transformation tool.

To write a CSV in Node:

const {writeFile} = require("node:fs/promises");
const cheerio = require("cheerio"); // ^1.0.0-rc.12
const Papa = require("papaparse"); // ^5.4.1

const url = "<Your URL>";

// Fetch the page, extract every table under .content (header row, data
// rows, then a blank spacer row per table), and write the result as CSV.
(async () => {
  const res = await fetch(url);

  if (!res.ok) {
    throw Error(res.statusText);
  }

  const $ = cheerio.load(await res.text());

  const data = $(".content table")
    .toArray()
    .flatMap(table => {
      const headers = $(table)
        .find("th")
        .toArray()
        .map(th => $(th).text().trim());
      const rows = $(table)
        .find("tr:has(td)")
        .toArray()
        .map(tr =>
          $(tr)
            .find("td")
            .toArray()
            .map(td => $(td).text().trim())
        );
      return [headers, ...rows, []];
    });

  await writeFile("out.csv", Papa.unparse(data));
})().catch(err => console.error(err));

Converted to GAS:

/**
 * Scrapes every table under ".content" (Google Apps Script: UrlFetchApp +
 * Cheerio) and logs a single 2D array: for each table, its trimmed header
 * row, its trimmed data rows, then a blank spacer row.
 */
function test() {
  const url = "<Your URL>";
  const html = UrlFetchApp.fetch(url).getContentText();
  const $ = Cheerio.load(html);

  const data = $(".content table")
    .toArray()
    .flatMap(table => {
      const headers = $(table)
        .find("th")
        .toArray()
        .map(th => $(th).text().trim());
      const rows = $(table)
        .find("tr:has(td)")
        .toArray()
        .map(tr =>
          $(tr)
            .find("td")
            .toArray()
            .map(td => $(td).text().trim())
        );
      return [headers, ...rows, []];
    });

  console.log(data);
}
0
TheMaster On

You could also do this with inbuilt formula IMPORTHTML to extract each table, providing appropriate indexes and then use REDUCE to merge the multiple 2D arrays:

=REDUCE(
  IMPORTHTML("<url>","table",6),
  SEQUENCE(10,1,7),
  LAMBDA(a,c,
    VSTACK(a,
      IMPORTHTML("<url>","table",c)
    )
  )
)