From 1dc965d9a8581f2ca18c61214011a8a48089dd6e Mon Sep 17 00:00:00 2001 From: lionel-rowe Date: Wed, 15 May 2024 10:53:37 +0800 Subject: [PATCH] Always escape ASCII chars with special meaning --- .../text-to-unicode.e2e.spec.ts | 10 +++--- .../text-to-unicode.service.test.ts | 35 +++++++++++++++++-- .../text-to-unicode.service.ts | 34 +++++++++--------- src/tools/text-to-unicode/text-to-unicode.vue | 6 ++-- 4 files changed, 57 insertions(+), 28 deletions(-) diff --git a/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts b/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts index 761828fd..139c38b2 100644 --- a/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts +++ b/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts @@ -10,16 +10,18 @@ test.describe('Tool - Text to Unicode', () => { }); test('Text to unicode conversion', async ({ page }) => { - await page.getByTestId('text-to-unicode-input').fill('it-tools'); + await page.getByTestId('text-to-unicode-input').fill('"it-tools" 文字'); const unicode = await page.getByTestId('text-to-unicode-output').inputValue(); - expect(unicode).toEqual('it-tools'); + // eslint-disable-next-line unicorn/escape-case + expect(unicode).toEqual(String.raw`\u0022it-tools\u0022 \u6587\u5b57`); }); test('Unicode to text conversion', async ({ page }) => { - await page.getByTestId('unicode-to-text-input').fill('it-tools'); + // eslint-disable-next-line unicorn/escape-case + await page.getByTestId('unicode-to-text-input').fill(String.raw`\u0022it-tools\u0022 \u6587\u5b57`); const text = await page.getByTestId('unicode-to-text-output').inputValue(); - expect(text).toEqual('it-tools'); + expect(text).toEqual('"it-tools" 文字'); }); }); diff --git a/src/tools/text-to-unicode/text-to-unicode.service.test.ts b/src/tools/text-to-unicode/text-to-unicode.service.test.ts index 6b81a7c7..94a6fc62 100644 --- a/src/tools/text-to-unicode/text-to-unicode.service.test.ts +++ b/src/tools/text-to-unicode/text-to-unicode.service.test.ts @@ -1,8 +1,8 @@ import { describe, expect, it } from 'vitest'; -import { type ConverterId, SKIP_PRINTABLE_ASCII_RE, converters } from './text-to-unicode.service'; +import { type ConverterId, SKIP_ASCII_HTML, SKIP_ASCII_JS, converters } from './text-to-unicode.service'; describe('text-to-unicode (legacy tests)', () => { - const convertTextToUnicode = converters.decimalEntities.escape; + const convertTextToUnicode = (text: string) => converters.decimalEntities.escape(text, false); const convertUnicodeToText = converters.decimalEntities.unescape; describe('convertTextToUnicode', () => { @@ -22,6 +22,23 @@ describe('text-to-unicode (legacy tests)', () => { }); }); +const ALL_PRINTABLE_ASCII = ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~'; + +describe('text-to-unicode regexes', () => { + // eslint-disable-next-line prefer-regex-literals + const skipAsciiJs = new RegExp(String.raw`([[ -~]--['"\\]]+)`, 'gv'); + // eslint-disable-next-line prefer-regex-literals + const skipAsciiHtml = new RegExp(String.raw`([[ -~]--[<>&'"]]+)`, 'gv'); + + it('regexes are equivalent to `v`-flag versions', () => { + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicodeSets + // regexes in `text-to-unicode.service.ts` can be replaced with `v`-flag versions once unicodeSets reaches + // sufficient in-browser support + expect(ALL_PRINTABLE_ASCII.match(skipAsciiJs)).toStrictEqual(ALL_PRINTABLE_ASCII.match(SKIP_ASCII_JS)); + expect(ALL_PRINTABLE_ASCII.match(skipAsciiHtml)).toStrictEqual(ALL_PRINTABLE_ASCII.match(SKIP_ASCII_HTML)); + }); +}); + describe('text-to-unicode', () => { interface TestConfig { text: string @@ -48,6 +65,18 @@ describe('text-to-unicode', () => { decimalEntities: 'ABC', }, }, + { + text: ALL_PRINTABLE_ASCII, + skipPrintableAscii: true, + results: { + // eslint-disable-next-line unicorn/escape-case + fullUnicode: String.raw` !\u0022#$%&\u0027()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005c]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`, + // eslint-disable-next-line unicorn/escape-case + utf16: String.raw` !\u0022#$%&\u0027()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005c]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`, + hexEntities: String.raw` !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`, + decimalEntities: String.raw` !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`, + }, + }, { text: '文字', results: { @@ -79,7 +108,7 @@ describe('text-to-unicode', () => { describe(key, () => { const converter = converters[key as ConverterId]; it('Escaping', () => { - expect(converter.escape(text, skipAscii ? SKIP_PRINTABLE_ASCII_RE : undefined)).toBe(result); + expect(converter.escape(text, skipAscii)).toBe(result); }); it('Unescaping', () => { expect(converter.unescape(result)).toBe(text); diff --git a/src/tools/text-to-unicode/text-to-unicode.service.ts b/src/tools/text-to-unicode/text-to-unicode.service.ts index c5cadf88..22568044 100644 --- a/src/tools/text-to-unicode/text-to-unicode.service.ts +++ b/src/tools/text-to-unicode/text-to-unicode.service.ts @@ -1,37 +1,35 @@ // regex that never matches const SKIP_NOTHING_RE = /(\b\B)/; -export const SKIP_PRINTABLE_ASCII_RE = /([ -~]+)/g; +export const SKIP_ASCII_JS = /([ -!#-&(-\[\]-~]+)/g; +export const SKIP_ASCII_HTML = /([ -!#-%(-;=?-~]+)/g; -function _codeUnits(text: string): number[] { +function codeUnits(text: string): number[] { return text.split('').map(char => char.codePointAt(0)); } -function _codePoints(text: string): number[] { +function codePoints(text: string): number[] { return [...text].map(char => char.codePointAt(0)); } export interface Converter { name: string - escape(text: string, skip: RegExp): string + escape(text: string, skipAscii: boolean): string unescape(text: string): string }; interface EscapeConfig { - getCharValues?(text: string): number[] + charValues?(text: string): number[] mapper(charValue: number): string + /** @prop regular expression for default content to skip. Must have exactly 1 capture group. */ + asciiSkipper: RegExp }; -function escaper({ getCharValues, mapper }: EscapeConfig) { - /** - * @param text text input to escape - * @param skipper regular expression for content _not_ to escape. Must have exactly 1 capture group. - */ - return (text: string, skipper?: RegExp): string => { - skipper ??= SKIP_NOTHING_RE; - getCharValues ??= _codePoints; +function escaper({ charValues: getCharValues, mapper, asciiSkipper: skipper }: EscapeConfig) { + return (text: string, skip: boolean): string => { + getCharValues ??= codePoints; return text - .split(skipper) + .split(skip ? skipper : SKIP_NOTHING_RE) .flatMap((x, i) => { if (i % 2) { return x; @@ -59,22 +57,22 @@ export type ConverterId = keyof typeof converters; const converters = { fullUnicode: { name: 'Full Unicode', - escape: escaper({ mapper: convertCodePointToUnicode }), + escape: escaper({ mapper: convertCodePointToUnicode, asciiSkipper: SKIP_ASCII_JS }), unescape: unescaper({ regex: /\\u\p{AHex}{4}|\\u\{\p{AHex}{1,6}\}/gu, radix: 16 }), }, utf16: { name: 'UTF-16 Code Units', - escape: escaper({ getCharValues: _codeUnits, mapper: convertCodePointToUnicode }), + escape: escaper({ charValues: codeUnits, mapper: convertCodePointToUnicode, asciiSkipper: SKIP_ASCII_JS }), unescape: unescaper({ regex: /\\u\p{AHex}{4}/gu, radix: 16 }), }, hexEntities: { name: 'HTML Entities (Hex)', - escape: escaper({ mapper: toHexEntities }), + escape: escaper({ mapper: toHexEntities, asciiSkipper: SKIP_ASCII_HTML }), unescape: unescaper({ regex: /&#x\p{AHex}{1,6};/gu, radix: 16 }), }, decimalEntities: { name: 'HTML Entities (Decimal)', - escape: escaper({ mapper: toDecimalEntities }), + escape: escaper({ mapper: toDecimalEntities, asciiSkipper: SKIP_ASCII_HTML }), unescape: unescaper({ regex: /&#\d+;/gu, radix: 10 }), }, } satisfies Record; diff --git a/src/tools/text-to-unicode/text-to-unicode.vue b/src/tools/text-to-unicode/text-to-unicode.vue index e4c58f19..ed185975 100644 --- a/src/tools/text-to-unicode/text-to-unicode.vue +++ b/src/tools/text-to-unicode/text-to-unicode.vue @@ -1,5 +1,5 @@