Factor out common URL regular expression code

This also eliminates the differences between the regular expressions.
2025-07-05 12:22:15 -04:00 · 2020-12-11 15:55:13 -05:00 · 2020-12-11 15:55:13 -05:00 · 7e8de5540f
commit 7e8de5540f
parent 66d0eb9a1f
4 changed files with 74 additions and 73 deletions
--- a/src/node/utils/ExportHtml.js
+++ b/src/node/utils/ExportHtml.js
@ -22,6 +22,7 @@ const hooks = require('ep_etherpad-lite/static/js/pluginfw/hooks');
 const eejs = require('ep_etherpad-lite/node/eejs');
 const _analyzeLine = require('./ExportHelper')._analyzeLine;
 const _encodeWhitespace = require('./ExportHelper')._encodeWhitespace;
 const padutils = require('../../static/js/pad_utils').padutils;
 async function getPadHTML(pad, revNum) {
  let atext = pad.atext;
@ -191,7 +192,7 @@ async function getHTMLFromAtext(pad, atext, authorColors) {
      }
    }
-    const urls = _findURLs(text);
+    const urls = padutils.findURLs(text);
    let idx = 0;
@ -459,30 +460,6 @@ exports.getPadHTMLDocument = async function (padId, revNum) {
  });
 };
 // URL detection helpers (copied from ACE).
 // Character class of the "word" characters (digits, Latin letters and a number of non-ASCII
 // ranges) that are accepted inside a URL in addition to the punctuation listed below.
 const _REGEX_WORDCHAR = /[\u0030-\u0039\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u1FFF\u3040-\u9FFF\uF900-\uFDFF\uFE70-\uFEFE\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFDC]/;
 // Matches a single whitespace character.
 const _REGEX_SPACE = /\s/;
 // Matches one character that may appear in a URL: URL punctuation or a word character.
 const _REGEX_URLCHAR = new RegExp(`(${/[-:@a-zA-Z0-9_.,~%+\/\\?=&#;()$]/.source}|${_REGEX_WORDCHAR.source})`);
 // Matches a whole URL: a known scheme prefix followed by URL characters, where the final
 // character must not be one of ":", ".", "," or ";". Global, so exec() can walk every match.
 const _REGEX_URL = new RegExp(`${/(?:(?:https?|s?ftp|ftps|file|smb|afp|nfs|(x-)?man|gopher|txmt):\/\/|mailto:)/.source + _REGEX_URLCHAR.source}*(?![:.,;])${_REGEX_URLCHAR.source}`, 'g');
 /**
  * Collects every URL found in a piece of text.
  *
  * @param {string} text - Text to scan.
  * @returns {?Array} `null` when the text contains no URL, otherwise a list of
  *     `[startIndex, url]` pairs in order of appearance.
  */
 function _findURLs(text) {
   const found = [];
   // The pattern is global and shared, so always start scanning from the beginning.
   _REGEX_URL.lastIndex = 0;
   for (let hit = _REGEX_URL.exec(text); hit !== null; hit = _REGEX_URL.exec(text)) {
     found.push([hit.index, hit[0]]);
   }
   return found.length > 0 ? found : null;
 }
 // copied from ACE
 function _processSpaces(s) {
  const doesWrap = true;
--- a/src/static/js/ace2_inner.js
+++ b/src/static/js/ace2_inner.js
@ -19,6 +19,9 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 const padutils = require('./pad_utils').padutils;
 let _, $, jQuery, plugins, Ace2Common;
 const browser = require('./browser');
 if (browser.msie) {
@ -2806,13 +2809,9 @@ function Ace2Inner() {
    }
  }
  // set of "letter or digit" chars is based on section 20.5.16 of the original Java Language Spec
  const REGEX_WORDCHAR = /[\u0030-\u0039\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u1FFF\u3040-\u9FFF\uF900-\uFDFF\uFE70-\uFEFE\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFDC]/;
  const REGEX_SPACE = /\s/;
-  function isWordChar(c) {
+  const isWordChar = (c) => padutils.wordCharRegex.test(c);
    return !!REGEX_WORDCHAR.exec(c);
  }
  editorInfo.ace_isWordChar = isWordChar;
  function isSpaceChar(c) {
--- a/src/static/js/linestylefilter.js
+++ b/src/static/js/linestylefilter.js
@ -33,6 +33,7 @@ const hooks = require('./pluginfw/hooks');
 const linestylefilter = {};
 const _ = require('./underscore');
 const AttributeManager = require('./AttributeManager');
 const padutils = require('./pad_utils').padutils;
 linestylefilter.ATTRIB_CLASSES = {
  bold: 'tag:b',
@ -224,11 +225,7 @@ linestylefilter.getRegexpFilter = function (regExp, tag) {
 };
-linestylefilter.REGEX_WORDCHAR = /[\u0030-\u0039\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u1FFF\u3040-\u9FFF\uF900-\uFDFF\uFE70-\uFEFE\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFDC]/;
+linestylefilter.getURLFilter = linestylefilter.getRegexpFilter(padutils.urlRegex, 'url');
 linestylefilter.REGEX_URLCHAR = new RegExp(`(${/[-:@a-zA-Z0-9_.,~%+\/\\?=&#!;()$]/.source}|${linestylefilter.REGEX_WORDCHAR.source})`);
 linestylefilter.REGEX_URL = new RegExp(`${/(?:(?:https?|s?ftp|ftps|file|nfs):\/\/|(about|geo|mailto|tel):|www\.)/.source + linestylefilter.REGEX_URLCHAR.source}*(?![:.,;])${linestylefilter.REGEX_URLCHAR.source}`, 'g');
 linestylefilter.getURLFilter = linestylefilter.getRegexpFilter(
    linestylefilter.REGEX_URL, 'url');
 linestylefilter.textAndClassFuncSplitter = function (func, splitPointsOpt) {
  let nextPointIndex = 0;
--- a/src/static/js/pad_utils.js
+++ b/src/static/js/pad_utils.js
@ -39,6 +39,55 @@ const randomString = (len) => {
  return randomstring;
 };
 // Matches a single "letter or digit" character. The set of ranges is based on section 20.5.16
 // of the original Java Language Spec.
 const wordCharRegex = (() => {
   // Inclusive [first, last] code unit pairs that make up the character class.
   const ranges = [
     ['\u0030', '\u0039'],
     ['\u0041', '\u005A'],
     ['\u0061', '\u007A'],
     ['\u00C0', '\u00D6'],
     ['\u00D8', '\u00F6'],
     ['\u00F8', '\u00FF'],
     ['\u0100', '\u1FFF'],
     ['\u3040', '\u9FFF'],
     ['\uF900', '\uFDFF'],
     ['\uFE70', '\uFEFE'],
     ['\uFF10', '\uFF19'],
     ['\uFF21', '\uFF3A'],
     ['\uFF41', '\uFF5A'],
     ['\uFF66', '\uFFDC'],
   ];
   const charClass = ranges.map(([first, last]) => `${first}-${last}`).join('');
   return new RegExp(`[${charClass}]`);
 })();
 // Global regular expression that matches a URL: a known scheme prefix (or "www.") followed by
 // one or more URL characters, the last of which is not trailing punctuation.
 const urlRegex = (() => {
   // Wraps a list of alternatives in a non-capturing group.
   const anyOf = (alternatives) => `(?:${alternatives.join('|')})`;

   // TODO: wordCharRegex matches many characters that are not permitted in URIs. Are they included
   // here as an attempt to support IRIs? (See https://tools.ietf.org/html/rfc3987.)
   const wordChars = wordCharRegex.source.slice(1, -1); // Strip the surrounding "[" and "]".
   const urlChar = ['[-:@_.,~%+/?=&#!;()$', wordChars, ']'].join('');

   // A character matching this class is not treated as part of the URL when it is the last
   // character that matches urlChar.
   const trailingPunct = '[:.,;]';

   // Schemes that must be followed by "://".
   const schemesWithAuth = anyOf([
     '(?:x-)?man',
     'afp',
     'file',
     'ftps?',
     'gopher',
     'https?',
     'nfs',
     'sftp',
     'smb',
     'txmt',
   ]);

   // Schemes that only need to be followed by ":".
   const schemesWithoutAuth = anyOf([
     'about',
     'geo',
     'mailto',
     'tel',
   ]);

   const prefix = `(?:${schemesWithAuth}://|${schemesWithoutAuth}:|www\\.)`;
   return new RegExp(`${prefix}${urlChar}*(?!${trailingPunct})${urlChar}`, 'g');
 })();
 const padutils = {
  escapeHtml: (x) => Security.escapeHTML(String(x)),
  uniqueId: () => {
@ -75,45 +124,24 @@ const padutils = {
    const hourmin = `${d.getHours()}:${(`0${d.getMinutes()}`).slice(-2)}`;
    return `${dayOfWeek} ${month} ${dayOfMonth} ${year} ${hourmin}`;
  },
  wordCharRegex,
  urlRegex,
  // returns null if no URLs, or [[startIndex1, url1], [startIndex2, url2], ...]
  findURLs: (text) => {
-    // copied from ACE
+    // Copy padutils.urlRegex so that the use of .exec() below (which mutates the RegExp object)
-    const _REGEX_WORDCHAR = new RegExp(`[${[
+    // does not break other concurrent uses of padutils.urlRegex.
-      '\u0030-\u0039',
+    const urlRegex = new RegExp(padutils.urlRegex, 'g');
-      '\u0041-\u005A',
+    urlRegex.lastIndex = 0;
-      '\u0061-\u007A',
+    let urls = null;
-      '\u00C0-\u00D6',
+    let execResult;
-      '\u00D8-\u00F6',
+    // TODO: Switch to String.prototype.matchAll() after support for Node.js < 12.0.0 is dropped.
-      '\u00F8-\u00FF',
+    while ((execResult = urlRegex.exec(text))) {
-      '\u0100-\u1FFF',
+      urls = (urls || []);
-      '\u3040-\u9FFF',
+      const startIndex = execResult.index;
-      '\uF900-\uFDFF',
+      const url = execResult[0];
-      '\uFE70-\uFEFE',
+      urls.push([startIndex, url]);
-      '\uFF10-\uFF19',
+    }
-      '\uFF21-\uFF3A',
+    return urls;
      '\uFF41-\uFF5A',
      '\uFF66-\uFFDC',
    ].join('')}]`);
    const _REGEX_URLCHAR = new RegExp(`([-:@a-zA-Z0-9_.,~%+/?=&#;()$]|${_REGEX_WORDCHAR.source})`);
    const _REGEX_URL = new RegExp(
        '(?:(?:https?|s?ftp|ftps|file|nfs)://|(about|geo|mailto|tel):)' +
        `${_REGEX_URLCHAR.source}*(?![:.,;])${_REGEX_URLCHAR.source}`, 'g');
    // returns null if no URLs, or [[startIndex1, url1], [startIndex2, url2], ...]
    const _findURLs = (text) => {
      _REGEX_URL.lastIndex = 0;
      let urls = null;
      let execResult;
      while ((execResult = _REGEX_URL.exec(text))) {
        urls = (urls || []);
        const startIndex = execResult.index;
        const url = execResult[0];
        urls.push([startIndex, url]);
      }
      return urls;
    };
    return _findURLs(text);
  },
  escapeHtmlWithClickableLinks: (text, target) => {
    let idx = 0;