Parse an HTML table without CSS identifiers or CSS selectors in Node.js

This data comes from a legacy system, and the output is what it is: we cannot add CSS classes or identifiers to it. Most of the online examples of parsing HTML in Node.js deal with tables, rows, and cells that carry an ID or CSS class, but so far I have not come across anything that can help parse the page below. This includes the examples for JSDOM (AFAIK).

I would like to extract each of the rows into a record of the form [fileName, link, size, dateTime], after which I can run some queries, such as finding the latest timestamp in a group, and then extract the file name and link - I thought about using YQL. The attributes on alternating table rows also make the task harder. I am new to Node.js, so some terms may be incorrect. Any help would be appreciated.

Thanks.

<html>
<body>
    <table width="100%" cellspacing="0" cellpadding="5" align="center">
        <tr> 
        <td align="left"><font size="+1"><strong>Filename</strong></font></td>
        <td align="center"><font size="+1"><strong>Size</strong></font></td>
        <td align="right"><font size="+1"><strong>Last Modified</strong></font></td>
        </tr>
        <tr>
        <td align="left">&nbsp;&nbsp;
        <a href="/path_to_file.csv"><tt>file1.csv</tt></a></td>
        <td align="right"><tt>86.6 kb</tt></td>
        <td align="right"><tt>Fri, 21 Mar 2014 21:00:19 GMT</tt></td>
        </tr>
        <tr bgcolor="#eeeeee">
        <td align="left">&nbsp;&nbsp;
        <a href="/path_to_file.csv"><tt>file2.csv</tt></a></td>
        <td align="right"><tt>20.7 kb</tt></td>
        <td align="right"><tt>Fri, 21 Mar 2014 21:00:19 GMT</tt></td>
        </tr>
        <tr>
        <td align="left">&nbsp;&nbsp;
        <a href="/path_to_file.xml"><tt>file1.xml</tt></a></td>
        <td align="right"><tt>266.5 kb</tt></td>
        <td align="right"><tt>Fri, 21 Mar 2014 21:00:19 GMT</tt></td>
        </tr>
        <tr bgcolor="#eeeeee">
        <td align="left">&nbsp;&nbsp;
        <a href="/path_to_file.xml"><tt>file2.xml</tt></a></td>
        <td align="right"><tt>27.2 kb</tt></td>
        <td align="right"><tt>Fri, 21 Mar 2014 21:00:19 GMT</tt></td>
        </tr>
    </table>
</body>
</html>

Answer (thanks @Enragedmrt):

    // Buffer the whole response before parsing. 'data' fires once per network
    // chunk, and a chunk may end in the middle of a tag or a table row, so
    // parsing each chunk on its own can yield broken or missing rows.
    var body = '';
    // Decode on the stream so that a multi-byte character split across two
    // chunks is reassembled instead of being corrupted.
    res.setEncoding('utf8');
    res.on('data', function(chunk) {
        body += chunk;
    });

    // Parse once the complete document has arrived and log one record per
    // file row: { Filename, Link, LastModified }.
    res.on('end', function() {
        // Local binding: avoids leaking `$` as an implicit global.
        var $ = cheerio.load(body);
        var rows = [];

        $('tr').each(function(i, tr) {
            var cells = $(tr).children();
            var fileItem = cells.eq(0);
            var linkItem = fileItem.find('a').first();
            var lastModifiedItem = cells.eq(2);

            // The header row contains no link and is not a file entry; skip it.
            // (Returning undefined continues the .each loop.)
            if (linkItem.length === 0) {
                return;
            }

            var row = {
                "Filename": fileItem.text().trim(),
                "Link": linkItem.attr("href"),
                "LastModified": lastModifiedItem.text().trim()
            };
            rows.push(row);
            console.log(row);
        });
    });
+4
source share
4 answers

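I would suggest Cheerio rather than JSDOM. You need to loop over each 'tr' and then read its 'td' children. Here is a rough example (my Node.js/Cheerio is rusty, but the API mirrors jQuery, so jQuery examples should also help):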

// Build one record per table row from its first three cells.
// Expects `$` to be a loaded Cheerio (or jQuery) instance.
var data = [];
$('tr').each(function(i, tr){
    var children = $(this).children();
    // children[n] is a raw DOM node with no .text() method (calling it throws
    // a TypeError); .eq(n) returns a wrapped selection that does have .text().
    var row = {
        // trim() drops the padding (&nbsp; and newlines) around the cell text.
        "Filename": children.eq(0).text().trim(),
        "Size": children.eq(1).text().trim(),
        "Last Modified": children.eq(2).text().trim()
    };
    data.push(row);
});
+7

With JSDom, the HTML is parsed into a DOM (Document Object Model), which you can then traverse and query like a page in a browser.

Google 5 , , ...

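See the JSDom page on GitHub for usage examples. It can be combined with jQuery, so you can write things like window.$("a.the-link").text(). For td and th cells that have no class or ID, you can select by tag or by attribute, e.g. td[align="left"]. You can also use .first and .each to walk the rows and cells in order and pick out the values you need.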

, JSDom, , .

0

JSFiddle

// Collect the text of every <tt> cell in the document, one array per table row.
var rawData = [];
var rows = document.getElementsByTagName('tr');
// Start at index 1 to skip the first row (the header row in the sample table).
for (var cnt = 1; cnt < rows.length; cnt++) {
    var cells = rows[cnt].getElementsByTagName('tt');
    var row = [];
    for (var count = 0; count < cells.length; count++) {
        // textContent instead of innerText: it is part of the core DOM, does
        // not depend on layout/rendering, and yields the same string for
        // these plain-text <tt> cells.
        row.push(cells[count].textContent.trim());
    }
    rawData.push(row);
}

console.log(rawData);
0

var cheerio = require('cheerio'),
    cheerioTableparser = require('cheerio-tableparser');

// Buffer the whole response before parsing. 'data' fires once per network
// chunk, and a chunk may end in the middle of a tag or a table row, so the
// table must not be parsed until the stream has ended.
var body = '';
// Decode on the stream so that a multi-byte character split across two
// chunks is reassembled instead of being corrupted.
res.setEncoding('utf8');
res.on('data', function(chunk) {
    body += chunk;
});

// Parse the complete document and log one record per table row:
// { Filename, Link, LastModified }.
res.on('end', function() {
    // Local binding: avoids leaking `$` as an implicit global.
    var $ = cheerio.load(body);
    cheerioTableparser($);
    var rows = [];

    // The result is indexed [column][row]; each entry is wrapped in a <div>
    // below and re-parsed, i.e. it is treated as a cell's HTML.
    // NOTE(review): the three `false` flags are presumably dupCols, dupRows
    // and textMode - confirm against the cheerio-tableparser README.
    var columns = $("table").parsetable(false, false, false);

    // No table (or an empty one) in the response: nothing to extract.
    if (!columns || columns.length === 0) {
        return;
    }

    columns[0].forEach(function(cellHtml, i) {

        var firstColumnHTMLCell = $("<div>" + cellHtml + "</div>");
        var fileItem = firstColumnHTMLCell.text().trim();
        var linkItem = firstColumnHTMLCell.find("a").attr("href");
        var lastModifiedItem = $("<div>" + columns[2][i] + "</div>").text();

        var row = {
            "Filename": fileItem,
            "Link": linkItem,
            "LastModified": lastModifiedItem
        };

        rows.push(row);
        console.log(row);
    });
});
0

Source: https://habr.com/ru/post/1532911/


All Articles