const cheerio = require('cheerio');
const minify = require('html-minifier').minify;
/**
 * Splits an array into consecutive slices of at most `len` elements.
 *
 * @param {Array} arr array to split; it is not modified
 * @param {number} len maximum size of each slice; must be a positive integer
 * @returns {Array<Array>} slices in original order; the last one may be shorter
 * @throws {RangeError} if `arr` is non-empty and `len` is not a positive integer
 */
function chunk(arr, len) {
  const chunks = [];
  if (arr.length === 0) {
    return chunks;
  }
  // A zero or negative step would never advance the cursor, so the loop
  // would run forever; fail loudly instead.
  if (!Number.isInteger(len) || len <= 0) {
    throw new RangeError(
      `chunk size must be a positive integer, got ${String(len)}`
    );
  }
  for (let start = 0; start < arr.length; start += len) {
    chunks.push(arr.slice(start, start + len));
  }
  return chunks;
}
/**
* @typedef {Object} MensaplanResponse
* @property {Object} json parsed JSON object with one key per day; each value is an object mapping category names to that day's food items
* @property {string} html human-readable HTML version of the Mensaplan
* @property {string} hinweis note from the institution/kitchen
* @property {Array} categories array of the categories available in the Mensaplan
* @property {string} timeRange start and end date of the Mensaplan, along with the calendar week if available
* @property {Array} days array of the days covered by this Mensaplan
* @property {Array} elements_unchunked unchunked (flat) array of food/day items in this Mensaplan
*/
/**
* Parses the HTML of a Mensaplan page into a structured response.
*
* The markup is first simplified with cheerio and a series of string
* replacements, then re-parsed to collect categories and per-day food items,
* which are finally arranged as json[day][category].
*
* @param {string} input HTML markup handed to cheerio.load (presumably the full page source - confirm against callers)
* @returns {Promise<MensaplanResponse>} resolves with the parsed plan; rejects with any error thrown while parsing
*/
exports.parser = (input) => {
return new Promise(function (resolve, reject) {
try {
// tmp starts out as a cheerio instance over the raw input
let tmp = cheerio.load(input);
// text of the #lblWoche element, returned as timeRange
const timeRange = tmp('#lblWoche').text();
// drop all attributes + unneeded elements for better parsing
tmp('*')
.removeAttr('style')
.removeAttr('class')
.removeAttr('valign')
.removeAttr('colspan')
.removeAttr('align')
.removeAttr('border')
.removeAttr('cellpadding')
.removeAttr('alt')
.removeAttr('title')
.removeAttr('onclick');
// removes every element that directly contains an img
tmp('img').parent().remove();
tmp('input').remove();
tmp('script').remove();
// removes the grandparent of #strDetails
tmp('#strDetails').parent().parent().remove();
// ids are only stripped from td and tr, so #tblMain stays selectable below
tmp('td').removeAttr('id');
tmp('tr').removeAttr('id');
// from here on tmp is an HTML string (inner HTML of the parent of #tblMain),
// no longer a cheerio instance
tmp = tmp('#tblMain').parent().html();
tmp = tmp.replaceAll('
• | ', '');
// minify html for easier parsing
tmp = minify(tmp, {
useShortDoctype: true,
minifyCSS: true,
collapseWhitespace: true
});
// remove empty food items
tmp = tmp.replaceAll(' |
', '');
// remove empty row at end of table
tmp = tmp.replaceAll(
' | | | | | ',
''
);
// replace tr,div,td with food+day elements for readability
tmp = tmp.replaceAll(
' | | | | | ',
''
);
tmp = tmp.replaceAll(
'',
' '
);
tmp = tmp.replaceAll(
'',
''
);
tmp = tmp.replaceAll(
' | ',
''
);
tmp = tmp.replaceAll(
' |
|
| ',
''
);
tmp = tmp.replaceAll(
' | ',
''
);
tmp = tmp.replaceAll(' | ', '');
tmp = tmp.replaceAll(' | ', '');
tmp = tmp.replaceAll('| ', ' | ');
// fix end of file invalid markup
tmp = tmp.replaceAll(' | ', ' | ');
// remove leading IDs from foods
// NOTE(review): the pattern matches every run of digits followed by a space
// anywhere in the markup, not only at the start of a food entry - confirm
// that no other content is affected
tmp = tmp.replaceAll(/\d+ /g, '');
// remove trailing whitespace from foods
tmp = tmp.replaceAll(' ', '');
// at this point, the html/ xml in 'tmp' is pretty readable
// begin parsing: load html into cheerio object
// the untouched input is used here because the .tdHeader selector relies on
// class attributes, which were removed from the working copy above
let $ = cheerio.load(input);
const hinweis = $('#lblSpeiesplanHinweis').text();
let days = [];
$('.tdHeader th').each((i, e) => {
days.push($(e).html());
});
// drop header cells with empty content
days = days.filter((h) => h !== '');
// load preprocessed html into cheerio object
// NOTE(review): $ is not read again before it is reassigned further down;
// the queries below go through $1 instead
$ = cheerio.load(tmp);
// parse markup to arrays
const $1 = cheerio.load(tmp);
let categories = [];
let elements = [];
// the text of every td left in the preprocessed markup becomes a category name
$1('td').each(function (index, element) {
categories.push($1(element).text());
});
// each day element yields one array of food items, appended in document order
$1('day').each(function (index, element) {
const $2 = cheerio.load($1(element).html());
let items = [];
$2('food').each(function (index, element) {
const $3 = cheerio.load($2(element).html());
let additives_allergies = [];
// span texts are collected before the sub elements are removed
$3('span').each(function (index, element) {
additives_allergies.push($3(element).text());
});
// strip sub elements so the title only holds the remaining text
$3('sub').remove();
items.push({ title: $3.text(), additives_allergies });
});
elements.push(items);
});
// NOTE(review): $ is not read after this assignment in the visible code
$ = cheerio.load(input);
// keep a reference to the flat list before it is grouped
const elements_unchunked = elements;
// group the flat list into rows of days.length entries; the loop below
// indexes these rows by category position
elements = chunk(elements, days.length);
// parse elements into final json structure
// out[day][category] = elements[categoryIndex][dayIndex]; index and i are
// manual counters tracking the category and day position respectively
// NOTE(review): if there are fewer rows than categories, elements[index] is
// undefined and the access throws, which ends up rejecting the promise
let out = {};
let index = 0;
categories.forEach((c) => {
let i = 0;
days.forEach((d) => {
if (!out[`${days[i]}`]) {
out[`${days[i]}`] = {};
}
out[`${days[i]}`][`${categories[index]}`] =
elements[index][i];
i++;
});
index++;
});
resolve({
json: out,
html: tmp,
hinweis,
categories,
timeRange,
days,
elements_unchunked
});
} catch (e) {
// any synchronous failure above is surfaced as a rejection
reject(e);
}
});
};
|