Please note that this version of the site is no longer being updated. All content has been merged with my IT Blog "Much Ado About IT". Click here to see this page on the other site. I've found that using Jekyll to be too fragile and complex. GitHub pages are too limited. The blog site was on WordPress (self-hosted) but that was too slow and clunky. The revamped blog now uses Hugo and is published using Netlify
How to extract a table from HTML
While Node-RED has some nodes for extracting data from HTML, the nodes are rather simplistic.
Here is a quick way to extract the data from an HTML table element.
To use it, you need to install the cheerio node.js module to your user data folder (usually ~/.node-red
).
Then you need to reference it in the global variables part of your settings.js
file (same location as above).
functionGlobalContext: {
cheerio: require('cheerio'),
},
Then add a flow that returns html from a web page (using the Request node).
Then add a function node with the following code:
/*jshint sub:true,asi:true,maxerr:1000*/
const tableSelector = '#ContentPlaceHolder1_grdStation'
const cheerio = global.get('cheerio')
const $ = cheerio.load(msg.payload)
const options = {
rowForHeadings: 0, // extract th cells from this row for column headings (zero-based)
ignoreHeadingRow: true, // Don't tread the heading row as data
ignoreRows: [],
}
const jsonReponse = []
const columnHeadings = []
$(tableSelector).each(function(i, table) {
var trs = $(table).find('tr')
// Set up the column heading names
getColHeadings( $(trs[options.rowForHeadings]) )
// Process rows for data
$(table).find('tr').each(processRow)
})
msg.payload = {
columnHeadings: columnHeadings,
rows: jsonReponse,
}
return msg
function getColHeadings(headingRow) {
const alreadySeen = {}
$(headingRow).find('th').each(function(j, cell) {
let tr = $(cell).text().trim()
if ( alreadySeen[tr] ) {
let suffix = ++alreadySeen[tr]
tr = `${tr}_${suffix}`
} else {
alreadySeen[tr] = 1
}
columnHeadings.push(tr)
})
}
function processRow(i, row) {
const rowJson = {}
if ( options.ignoreHeadingRow && i === options.rowForHeadings ) return
// TODO: Process options.ignoreRows
$(row).find('td').each(function(j, cell) {
rowJson[ columnHeadings[j] ] = $(cell).text().trim()
})
// Skip blank rows
if (JSON.stringify(rowJson) !== '{}') jsonReponse.push(rowJson)
}
//EOF
The result is a msg.payload
containing 2 arrays. The first is the list of column names.
The second is an array of objects, each object containing a property for each column.
Code inspired by: https://github.com/iaincollins/tabletojson/blob/master/lib/tabletojson.js and by a conversation in the Node-RED Google Group.
Example Flow
[{"id":"815bf28f.d0691","type":"inject","z":"e7463dd2.db517","name":"get data","topic":"","payload":"","payloadType":"date","repeat":"","crontab":"","once":false,"x":120,"y":100,"wires":[["46f8420f.a312dc"]]},{"id":"46f8420f.a312dc","type":"http request","z":"e7463dd2.db517","name":"","method":"GET","ret":"txt","url":"http://publicinfobanjir.water.gov.my/View/OnlineFloodInfo/PublicWaterLevel.aspx?scode=SEL","tls":"","x":330,"y":100,"wires":[["808d9c98.e305d"]]},{"id":"fe447848.e25938","type":"debug","z":"e7463dd2.db517","name":"","active":true,"complete":false,"x":810,"y":60,"wires":[]},{"id":"d0660df5.4f339","type":"function","z":"e7463dd2.db517","name":"","func":"/*jshint sub:true,asi:true,maxerr:1000*/\n\nconst tableSelector = '#ContentPlaceHolder1_grdStation'\n\nconst cheerio = global.get('cheerio')\nconst $ = cheerio.load(msg.payload)\nconst options = {\n rowForHeadings: 0, // extract th cells from this row for column headings (zero-based)\n ignoreHeadingRow: true, // Don't tread the heading row as data\n ignoreRows: [],\n}\nconst jsonReponse = []\nconst columnHeadings = []\n\n$(tableSelector).each(function(i, table) {\n var trs = $(table).find('tr')\n \n // Set up the column heading names\n getColHeadings( $(trs[options.rowForHeadings]) )\n\n // Process rows for data\n $(table).find('tr').each(processRow)\n})\n\nmsg.payload = {\n columnHeadings: columnHeadings,\n rows: jsonReponse,\n}\n\nreturn msg\n\nfunction getColHeadings(headingRow) {\n const alreadySeen = {}\n \n $(headingRow).find('th').each(function(j, cell) {\n let tr = $(cell).text().trim()\n \n if ( alreadySeen[tr] ) {\n let suffix = ++alreadySeen[tr]\n tr = `${tr}_${suffix}`\n } else {\n alreadySeen[tr] = 1\n }\n \n columnHeadings.push(tr)\n })\n}\n\nfunction processRow(i, row) {\n const rowJson = {}\n \n if ( options.ignoreHeadingRow && i === options.rowForHeadings ) return\n // TODO: Process options.ignoreRows\n \n $(row).find('td').each(function(j, cell) {\n rowJson[ columnHeadings[j] ] = $(cell).text().trim()\n })\n \n // Skip blank rows\n if (JSON.stringify(rowJson) !== '{}') jsonReponse.push(rowJson)\n}\n\n//EOF","outputs":"1","noerr":0,"x":650,"y":100,"wires":[["fe447848.e25938","c35dcf95.5cc9","641996d3.fe8118"]]},{"id":"641996d3.fe8118","type":"change","z":"e7463dd2.db517","name":"","rules":[{"t":"set","p":"selected","pt":"msg","to":"{}","tot":"json"},{"t":"set","p":"selected.station","pt":"msg","to":"payload.rows[40]['Station ID']","tot":"msg"},{"t":"set","p":"selected.level","pt":"msg","to":"payload.rows[40]['River Level (m)']","tot":"msg"},{"t":"move","p":"selected","pt":"msg","to":"payload","tot":"msg"}],"action":"","property":"","from":"","to":"","reg":false,"x":820,"y":100,"wires":[["f0152260.70e1d"]]},{"id":"f0152260.70e1d","type":"debug","z":"e7463dd2.db517","name":"","active":true,"complete":false,"x":990,"y":100,"wires":[]},{"id":"c35dcf95.5cc9","type":"change","z":"e7463dd2.db517","name":"","rules":[{"t":"set","p":"payload","pt":"msg","to":"$$.payload.rows.(\t {\t \"station\": $.'Station ID',\t \"name\": $.'Station Name',\t \"level\": $.'River Level (m)',\t \"alert\": $number($.'River Level (m)') > $number($.Alert) ? true : false\t }\t)","tot":"jsonata"}],"action":"","property":"","from":"","to":"","reg":false,"x":820,"y":140,"wires":[["d505285a.26bcb8"]]},{"id":"d505285a.26bcb8","type":"debug","z":"e7463dd2.db517","name":"","active":true,"complete":false,"x":990,"y":140,"wires":[]},{"id":"808d9c98.e305d","type":"function","z":"e7463dd2.db517","name":"cache","func":"/*jshint sub:true,asi:true,maxerr:1000*/\n// Expects input msgs with topic set \n\nconst contextVarName = 'httpReqMsgs' // homeMsgs\n\n// saved context\nvar cachedMsgs = context.get(contextVarName) || {}\n\n// Only send to single client if needed\nvar socketId = null\nif ( msg.hasOwnProperty('_socketId') ) {\n socketId = msg._socketId\n}\n\n// Replay cache if requested\nif ( msg.hasOwnProperty('cacheControl') && msg.cacheControl.toUpperCase() === 'REPLAY' ) {\n for (var topic in cachedMsgs) {\n let newMsg = {\n \"topic\": topic, \n \"payload\": cachedMsgs[topic]\n }\n // Only send to a single client if we can\n if ( socketId !== null ) newMsg._socketId = socketId\n node.send(newMsg)\n }\n return null\n}\n// -- else if --\n// Empty cache if requested\nif ( (msg.hasOwnProperty('cacheControl') && msg.cacheControl === 'RESET') ||\n (msg.payload.hasOwnProperty('cacheControl') && msg.payload.cacheControl === 'RESET') ) {\n cachedMsgs = {}\n context.set(contextVarName, cachedMsgs)\n return null\n}\n// -- else --\n\n// ignore cacheControl and uibuilder control messages\nif ( msg.hasOwnProperty('cacheControl') || msg.hasOwnProperty('uibuilderCtrl') ) return null\n\n// Add a counter for each device name\nif ( msg.topic.endsWith('$name') ) {\n let topic = msg.topic.replace('$name', '$count')\n let count = cachedMsgs[topic] || 0\n count = count + 1\n cachedMsgs[topic] = count\n let newMsg = {\n \"topic\": topic, \n \"payload\": count\n }\n // Only send to a single client if we can\n if ( socketId !== null ) newMsg._socketId = socketId\n node.send(newMsg)\n}\n\n// Keep the last msg.payload by topic\ncachedMsgs[msg.topic] = msg.payload\n\n// save context for next time\ncontext.set(contextVarName, cachedMsgs)\n\nreturn msg;","outputs":1,"noerr":0,"x":510,"y":100,"wires":[["d0660df5.4f339","c0cacccb.9e3c7"]]},{"id":"afc218de.706f88","type":"inject","z":"e7463dd2.db517","name":"replay","topic":"","payload":"","payloadType":"str","repeat":"","crontab":"","once":false,"x":110,"y":160,"wires":[["eddcc57e.277418"]]},{"id":"c0cacccb.9e3c7","type":"debug","z":"e7463dd2.db517","name":"","active":false,"complete":"true","x":650,"y":180,"wires":[]},{"id":"eddcc57e.277418","type":"change","z":"e7463dd2.db517","name":"","rules":[{"t":"set","p":"cacheControl","pt":"msg","to":"REPLAY","tot":"str"}],"action":"","property":"","from":"","to":"","reg":false,"x":300,"y":160,"wires":[["808d9c98.e305d"]]}]