diff --git a/src/node/utils/ExportHtml.js b/src/node/utils/ExportHtml.js index cfb294138..2f5a77c9a 100644 --- a/src/node/utils/ExportHtml.js +++ b/src/node/utils/ExportHtml.js @@ -22,6 +22,7 @@ const hooks = require('ep_etherpad-lite/static/js/pluginfw/hooks'); const eejs = require('ep_etherpad-lite/node/eejs'); const _analyzeLine = require('./ExportHelper')._analyzeLine; const _encodeWhitespace = require('./ExportHelper')._encodeWhitespace; +const padutils = require('../../static/js/pad_utils').padutils; async function getPadHTML(pad, revNum) { let atext = pad.atext; @@ -191,7 +192,7 @@ async function getHTMLFromAtext(pad, atext, authorColors) { } } - const urls = _findURLs(text); + const urls = padutils.findURLs(text); let idx = 0; @@ -459,30 +460,6 @@ exports.getPadHTMLDocument = async function (padId, revNum) { }); }; -// copied from ACE -const _REGEX_WORDCHAR = /[\u0030-\u0039\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u1FFF\u3040-\u9FFF\uF900-\uFDFF\uFE70-\uFEFE\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFDC]/; -const _REGEX_SPACE = /\s/; -const _REGEX_URLCHAR = new RegExp(`(${/[-:@a-zA-Z0-9_.,~%+\/\\?=&#;()$]/.source}|${_REGEX_WORDCHAR.source})`); -const _REGEX_URL = new RegExp(`${/(?:(?:https?|s?ftp|ftps|file|smb|afp|nfs|(x-)?man|gopher|txmt):\/\/|mailto:)/.source + _REGEX_URLCHAR.source}*(?![:.,;])${_REGEX_URLCHAR.source}`, 'g'); - -// returns null if no URLs, or [[startIndex1, url1], [startIndex2, url2], ...] - - -function _findURLs(text) { - _REGEX_URL.lastIndex = 0; - let urls = null; - let execResult; - while ((execResult = _REGEX_URL.exec(text))) { - urls = (urls || []); - const startIndex = execResult.index; - const url = execResult[0]; - urls.push([startIndex, url]); - } - - return urls; -} - - // copied from ACE function _processSpaces(s) { const doesWrap = true; diff --git a/src/static/js/ace2_inner.js b/src/static/js/ace2_inner.js index 06e8908e3..30eed957f 100644 --- a/src/static/js/ace2_inner.js +++ b/src/static/js/ace2_inner.js @@ -19,6 +19,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +const padutils = require('./pad_utils').padutils; + let _, $, jQuery, plugins, Ace2Common; const browser = require('./browser'); if (browser.msie) { @@ -2806,13 +2809,9 @@ function Ace2Inner() { } } - // set of "letter or digit" chars is based on section 20.5.16 of the original Java Language Spec - const REGEX_WORDCHAR = /[\u0030-\u0039\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u1FFF\u3040-\u9FFF\uF900-\uFDFF\uFE70-\uFEFE\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFDC]/; const REGEX_SPACE = /\s/; - function isWordChar(c) { - return !!REGEX_WORDCHAR.exec(c); - } + const isWordChar = (c) => padutils.wordCharRegex.test(c); editorInfo.ace_isWordChar = isWordChar; function isSpaceChar(c) { diff --git a/src/static/js/linestylefilter.js b/src/static/js/linestylefilter.js index b2d35bb97..f0e48268c 100644 --- a/src/static/js/linestylefilter.js +++ b/src/static/js/linestylefilter.js @@ -33,6 +33,7 @@ const hooks = require('./pluginfw/hooks'); const linestylefilter = {}; const _ = require('./underscore'); const AttributeManager = require('./AttributeManager'); +const padutils = require('./pad_utils').padutils; linestylefilter.ATTRIB_CLASSES = { bold: 'tag:b', @@ -224,11 +225,7 @@ linestylefilter.getRegexpFilter = function (regExp, tag) { }; -linestylefilter.REGEX_WORDCHAR = /[\u0030-\u0039\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u1FFF\u3040-\u9FFF\uF900-\uFDFF\uFE70-\uFEFE\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFDC]/; -linestylefilter.REGEX_URLCHAR = new RegExp(`(${/[-:@a-zA-Z0-9_.,~%+\/\\?=&#!;()$]/.source}|${linestylefilter.REGEX_WORDCHAR.source})`); -linestylefilter.REGEX_URL = new RegExp(`${/(?:(?:https?|s?ftp|ftps|file|nfs):\/\/|(about|geo|mailto|tel):|www\.)/.source + linestylefilter.REGEX_URLCHAR.source}*(?![:.,;])${linestylefilter.REGEX_URLCHAR.source}`, 'g'); -linestylefilter.getURLFilter = linestylefilter.getRegexpFilter( - linestylefilter.REGEX_URL, 'url'); +linestylefilter.getURLFilter = linestylefilter.getRegexpFilter(padutils.urlRegex, 'url'); linestylefilter.textAndClassFuncSplitter = function (func, splitPointsOpt) { let nextPointIndex = 0; diff --git a/src/static/js/pad_utils.js b/src/static/js/pad_utils.js index 1fdf7cd6c..484b2aabb 100644 --- a/src/static/js/pad_utils.js +++ b/src/static/js/pad_utils.js @@ -39,6 +39,55 @@ const randomString = (len) => { return randomstring; }; +// Set of "letter or digit" chars is based on section 20.5.16 of the original Java Language Spec. +const wordCharRegex = new RegExp(`[${[ + '\u0030-\u0039', + '\u0041-\u005A', + '\u0061-\u007A', + '\u00C0-\u00D6', + '\u00D8-\u00F6', + '\u00F8-\u00FF', + '\u0100-\u1FFF', + '\u3040-\u9FFF', + '\uF900-\uFDFF', + '\uFE70-\uFEFE', + '\uFF10-\uFF19', + '\uFF21-\uFF3A', + '\uFF41-\uFF5A', + '\uFF66-\uFFDC', +].join('')}]`); + +const urlRegex = (() => { + // TODO: wordCharRegex matches many characters that are not permitted in URIs. Are they included + // here as an attempt to support IRIs? (See https://tools.ietf.org/html/rfc3987.) + const urlChar = `[-:@_.,~%+/?=&#!;()$${wordCharRegex.source.slice(1, -1)}]`; + // Matches a single character that should not be considered part of the URL if it is the last + // character that matches urlChar. + const postUrlPunct = '[:.,;]'; + // Schemes that must be followed by :// + const withAuth = `(?:${[ + '(?:x-)?man', + 'afp', + 'file', + 'ftps?', + 'gopher', + 'https?', + 'nfs', + 'sftp', + 'smb', + 'txmt', + ].join('|')})://`; + // Schemes that do not need to be followed by :// + const withoutAuth = `(?:${[ + 'about', + 'geo', + 'mailto', + 'tel', + ].join('|')}):`; + return new RegExp( + `(?:${withAuth}|${withoutAuth}|www\\.)${urlChar}*(?!${postUrlPunct})${urlChar}`, 'g'); +})(); + const padutils = { escapeHtml: (x) => Security.escapeHTML(String(x)), uniqueId: () => { @@ -75,45 +124,24 @@ const padutils = { const hourmin = `${d.getHours()}:${(`0${d.getMinutes()}`).slice(-2)}`; return `${dayOfWeek} ${month} ${dayOfMonth} ${year} ${hourmin}`; }, + wordCharRegex, + urlRegex, + // returns null if no URLs, or [[startIndex1, url1], [startIndex2, url2], ...] findURLs: (text) => { - // copied from ACE - const _REGEX_WORDCHAR = new RegExp(`[${[ - '\u0030-\u0039', - '\u0041-\u005A', - '\u0061-\u007A', - '\u00C0-\u00D6', - '\u00D8-\u00F6', - '\u00F8-\u00FF', - '\u0100-\u1FFF', - '\u3040-\u9FFF', - '\uF900-\uFDFF', - '\uFE70-\uFEFE', - '\uFF10-\uFF19', - '\uFF21-\uFF3A', - '\uFF41-\uFF5A', - '\uFF66-\uFFDC', - ].join('')}]`); - const _REGEX_URLCHAR = new RegExp(`([-:@a-zA-Z0-9_.,~%+/?=&#;()$]|${_REGEX_WORDCHAR.source})`); - const _REGEX_URL = new RegExp( - '(?:(?:https?|s?ftp|ftps|file|nfs)://|(about|geo|mailto|tel):)' + - `${_REGEX_URLCHAR.source}*(?![:.,;])${_REGEX_URLCHAR.source}`, 'g'); - - // returns null if no URLs, or [[startIndex1, url1], [startIndex2, url2], ...] - const _findURLs = (text) => { - _REGEX_URL.lastIndex = 0; - let urls = null; - let execResult; - while ((execResult = _REGEX_URL.exec(text))) { - urls = (urls || []); - const startIndex = execResult.index; - const url = execResult[0]; - urls.push([startIndex, url]); - } - - return urls; - }; - - return _findURLs(text); + // Copy padutils.urlRegex so that the use of .exec() below (which mutates the RegExp object) + // does not break other concurrent uses of padutils.urlRegex. + const urlRegex = new RegExp(padutils.urlRegex, 'g'); + urlRegex.lastIndex = 0; + let urls = null; + let execResult; + // TODO: Switch to String.prototype.matchAll() after support for Node.js < 12.0.0 is dropped. + while ((execResult = urlRegex.exec(text))) { + urls = (urls || []); + const startIndex = execResult.index; + const url = execResult[0]; + urls.push([startIndex, url]); + } + return urls; }, escapeHtmlWithClickableLinks: (text, target) => { let idx = 0;