HTML import: Replace cheerio with jsdom to simplify contentcollector

Cheerio provides jQuery-like objects, but they wrap DOM Node-like
objects that are not 100% API-compatible with the DOM spec. Because of
this, contentcollector, which is used in browsers and in Node.js
during HTML import, has until now needed to support two different
APIs. This commit modifies HTML import to use jsdom instead of cheerio
and simplifies contentcollector.
This commit is contained in:
Richard Hansen 2021-01-24 01:08:08 -05:00
parent 84d6d277d7
commit c816c20bc7
6 changed files with 330 additions and 215 deletions

View file

@@ -18,7 +18,7 @@
const log4js = require('log4js');
const Changeset = require('../../static/js/Changeset');
const contentcollector = require('../../static/js/contentcollector');
const cheerio = require('cheerio');
const jsdom = require('jsdom');
const rehype = require('rehype');
const minifyWhitespace = require('rehype-minify-whitespace');
@@ -31,13 +31,12 @@ exports.setPadHTML = async (pad, html) => {
html = String(output);
});
const $ = cheerio.load(html);
const {window: {document}} = new jsdom.JSDOM(html);
// Appends a line break, used by Etherpad to ensure a caret is available
// below the last line of an import
$('body').append('<p></p>');
document.body.appendChild(document.createElement('p'));
const doc = $('body')[0];
apiLogger.debug('html:');
apiLogger.debug(html);
@@ -46,7 +45,7 @@ exports.setPadHTML = async (pad, html) => {
const cc = contentcollector.makeContentCollector(true, null, pad.pool);
try {
// we use a try here because if the HTML is bad it will blow up
cc.collectContent(doc);
cc.collectContent(document.body);
} catch (e) {
apiLogger.warn('HTML was not properly formed', e);