From b0ae8d7b3dda98d056a0814051ff5ebf45388f16 Mon Sep 17 00:00:00 2001 From: lionel-rowe Date: Tue, 14 May 2024 21:45:31 +0800 Subject: [PATCH 1/4] fix(text-to-unicode): handle non-BMP + more conversion options --- .../text-to-unicode.service.test.ts | 75 +++++++++++- .../text-to-unicode.service.ts | 96 ++++++++++++++- src/tools/text-to-unicode/text-to-unicode.vue | 114 ++++++++++++++---- 3 files changed, 257 insertions(+), 28 deletions(-) diff --git a/src/tools/text-to-unicode/text-to-unicode.service.test.ts b/src/tools/text-to-unicode/text-to-unicode.service.test.ts index bda4fa7a..6b81a7c7 100644 --- a/src/tools/text-to-unicode/text-to-unicode.service.test.ts +++ b/src/tools/text-to-unicode/text-to-unicode.service.test.ts @@ -1,7 +1,10 @@ import { describe, expect, it } from 'vitest'; -import { convertTextToUnicode, convertUnicodeToText } from './text-to-unicode.service'; +import { type ConverterId, SKIP_PRINTABLE_ASCII_RE, converters } from './text-to-unicode.service'; + +describe('text-to-unicode (legacy tests)', () => { + const convertTextToUnicode = converters.decimalEntities.escape; + const convertUnicodeToText = converters.decimalEntities.unescape; -describe('text-to-unicode', () => { describe('convertTextToUnicode', () => { it('a text string is converted to unicode representation', () => { expect(convertTextToUnicode('A')).toBe('A'); @@ -18,3 +21,71 @@ describe('text-to-unicode', () => { }); }); }); + +describe('text-to-unicode', () => { + interface TestConfig { + text: string + results: Record + skipPrintableAscii?: boolean + }; + const tests: TestConfig[] = [ + { + text: 'ABC', + results: { + fullUnicode: String.raw`\u0041\u0042\u0043`, + utf16: String.raw`\u0041\u0042\u0043`, + hexEntities: String.raw`ABC`, + decimalEntities: String.raw`ABC`, + }, + }, + { + text: 'ABC', + skipPrintableAscii: true, + results: { + fullUnicode: 'ABC', + utf16: 'ABC', + hexEntities: 'ABC', + decimalEntities: 'ABC', + }, + }, + { + text: '文字', + results: { + // eslint-disable-next-line unicorn/escape-case + fullUnicode: String.raw`\u6587\u5b57`, + // eslint-disable-next-line unicorn/escape-case + utf16: String.raw`\u6587\u5b57`, + hexEntities: String.raw`文字`, + decimalEntities: String.raw`文字`, + }, + }, + { + text: 'a 💩 b', + skipPrintableAscii: true, + results: { + // eslint-disable-next-line unicorn/escape-case + fullUnicode: String.raw`a \u{1f4a9} b`, + // eslint-disable-next-line unicorn/escape-case + utf16: String.raw`a \ud83d\udca9 b`, + hexEntities: String.raw`a 💩 b`, + decimalEntities: String.raw`a 💩 b`, + }, + }, + ]; + + for (const { text, skipPrintableAscii: skipAscii, results } of tests) { + describe(`${text} (skipAscii=${skipAscii})`, () => { + for (const [key, result] of Object.entries(results)) { + describe(key, () => { + const converter = converters[key as ConverterId]; + it('Escaping', () => { + expect(converter.escape(text, skipAscii ? SKIP_PRINTABLE_ASCII_RE : undefined)).toBe(result); + }); + it('Unescaping', () => { + expect(converter.unescape(result)).toBe(text); + }); + }); + } + }); + } +}); diff --git a/src/tools/text-to-unicode/text-to-unicode.service.ts b/src/tools/text-to-unicode/text-to-unicode.service.ts index e7772cf8..c5cadf88 100644 --- a/src/tools/text-to-unicode/text-to-unicode.service.ts +++ b/src/tools/text-to-unicode/text-to-unicode.service.ts @@ -1,9 +1,95 @@ -function convertTextToUnicode(text: string): string { - return text.split('').map(value => `&#${value.charCodeAt(0)};`).join(''); +// regex that never matches +const SKIP_NOTHING_RE = /(\b\B)/; +export const SKIP_PRINTABLE_ASCII_RE = /([ -~]+)/g; + +function _codeUnits(text: string): number[] { + return text.split('').map(char => char.codePointAt(0)); } -function convertUnicodeToText(unicodeStr: string): string { - return unicodeStr.replace(/&#(\d+);/g, (match, dec) => String.fromCharCode(dec)); +function _codePoints(text: string): number[] { + return [...text].map(char => char.codePointAt(0)); } -export { convertTextToUnicode, convertUnicodeToText }; +export interface Converter { + name: string + escape(text: string, skip: RegExp): string + unescape(text: string): string +}; + +interface EscapeConfig { + getCharValues?(text: string): number[] + mapper(charValue: number): string +}; + +function escaper({ getCharValues, mapper }: EscapeConfig) { + /** + * @param text text input to escape + * @param skipper regular expression for content _not_ to escape. Must have exactly 1 capture group. + */ + return (text: string, skipper?: RegExp): string => { + skipper ??= SKIP_NOTHING_RE; + getCharValues ??= _codePoints; + + return text + .split(skipper) + .flatMap((x, i) => { + if (i % 2) { + return x; + } + return getCharValues(x).map(mapper); + }) + .join(''); + }; +} + +interface UnescapeConfig { + regex: RegExp + radix: number +}; + +function unescaper({ regex, radix }: UnescapeConfig) { + return (escaped: string): string => { + return escaped.replace(regex, (match) => { + return String.fromCodePoint(Number.parseInt(match.replace(/\P{AHex}/gu, ''), radix)); + }); + }; +} + +export type ConverterId = keyof typeof converters; +const converters = { + fullUnicode: { + name: 'Full Unicode', + escape: escaper({ mapper: convertCodePointToUnicode }), + unescape: unescaper({ regex: /\\u\p{AHex}{4}|\\u\{\p{AHex}{1,6}\}/gu, radix: 16 }), + }, + utf16: { + name: 'UTF-16 Code Units', + escape: escaper({ getCharValues: _codeUnits, mapper: convertCodePointToUnicode }), + unescape: unescaper({ regex: /\\u\p{AHex}{4}/gu, radix: 16 }), + }, + hexEntities: { + name: 'HTML Entities (Hex)', + escape: escaper({ mapper: toHexEntities }), + unescape: unescaper({ regex: /&#x\p{AHex}{1,6};/gu, radix: 16 }), + }, + decimalEntities: { + name: 'HTML Entities (Decimal)', + escape: escaper({ mapper: toDecimalEntities }), + unescape: unescaper({ regex: /&#\d+;/gu, radix: 10 }), + }, +} satisfies Record; + +function convertCodePointToUnicode(codePoint: number): string { + const hex = codePoint.toString(16); + return hex.length > 4 ? String.raw`\u{${hex}}` : String.raw`\u${hex.padStart(4, '0')}`; +} + +function toHexEntities(codePoint: number): string { + return `&#x${codePoint.toString(16)};`; +} + +function toDecimalEntities(codePoint: number): string { + return `&#${codePoint};`; +} + +export { converters }; diff --git a/src/tools/text-to-unicode/text-to-unicode.vue b/src/tools/text-to-unicode/text-to-unicode.vue index be9bed86..e4c58f19 100644 --- a/src/tools/text-to-unicode/text-to-unicode.vue +++ b/src/tools/text-to-unicode/text-to-unicode.vue @@ -1,34 +1,106 @@ + + From 1dc965d9a8581f2ca18c61214011a8a48089dd6e Mon Sep 17 00:00:00 2001 From: lionel-rowe Date: Wed, 15 May 2024 10:53:37 +0800 Subject: [PATCH 2/4] Always escape ASCII chars with special meaning --- .../text-to-unicode.e2e.spec.ts | 10 +++--- .../text-to-unicode.service.test.ts | 35 +++++++++++++++++-- .../text-to-unicode.service.ts | 34 +++++++++--------- src/tools/text-to-unicode/text-to-unicode.vue | 6 ++-- 4 files changed, 57 insertions(+), 28 deletions(-) diff --git a/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts b/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts index 761828fd..139c38b2 100644 --- a/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts +++ b/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts @@ -10,16 +10,18 @@ test.describe('Tool - Text to Unicode', () => { }); test('Text to unicode conversion', async ({ page }) => { - await page.getByTestId('text-to-unicode-input').fill('it-tools'); + await page.getByTestId('text-to-unicode-input').fill('"it-tools" 文字'); const unicode = await page.getByTestId('text-to-unicode-output').inputValue(); - expect(unicode).toEqual('it-tools'); + // eslint-disable-next-line unicorn/escape-case + expect(unicode).toEqual(String.raw`\u0022it-tools\u0022 \u6587\u5b57`); }); test('Unicode to text conversion', async ({ page }) => { - await page.getByTestId('unicode-to-text-input').fill('it-tools'); + // eslint-disable-next-line unicorn/escape-case + await page.getByTestId('unicode-to-text-input').fill(String.raw`\u0022it-tools\u0022 \u6587\u5b57`); const text = await page.getByTestId('unicode-to-text-output').inputValue(); - expect(text).toEqual('it-tools'); + expect(text).toEqual('"it-tools" 文字'); }); }); diff --git a/src/tools/text-to-unicode/text-to-unicode.service.test.ts b/src/tools/text-to-unicode/text-to-unicode.service.test.ts index 6b81a7c7..94a6fc62 100644 --- a/src/tools/text-to-unicode/text-to-unicode.service.test.ts +++ b/src/tools/text-to-unicode/text-to-unicode.service.test.ts @@ -1,8 +1,8 @@ import { describe, expect, it } from 'vitest'; -import { type ConverterId, SKIP_PRINTABLE_ASCII_RE, converters } from './text-to-unicode.service'; +import { type ConverterId, SKIP_ASCII_HTML, SKIP_ASCII_JS, converters } from './text-to-unicode.service'; describe('text-to-unicode (legacy tests)', () => { - const convertTextToUnicode = converters.decimalEntities.escape; + const convertTextToUnicode = (text: string) => converters.decimalEntities.escape(text, false); const convertUnicodeToText = converters.decimalEntities.unescape; describe('convertTextToUnicode', () => { @@ -22,6 +22,23 @@ describe('text-to-unicode (legacy tests)', () => { }); }); +const ALL_PRINTABLE_ASCII = ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~'; + +describe('text-to-unicode regexes', () => { + // eslint-disable-next-line prefer-regex-literals + const skipAsciiJs = new RegExp(String.raw`([[ -~]--['"\\]]+)`, 'gv'); + // eslint-disable-next-line prefer-regex-literals + const skipAsciiHtml = new RegExp(String.raw`([[ -~]--[<>&'"]]+)`, 'gv'); + + it('regexes are equivalent to `v`-flag versions', () => { + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicodeSets + // regexes in `text-to-unicode.service.ts` can be replaced with `v`-flag versions once unicodeSets reaches + // sufficient in-browser support + expect(ALL_PRINTABLE_ASCII.match(skipAsciiJs)).toStrictEqual(ALL_PRINTABLE_ASCII.match(SKIP_ASCII_JS)); + expect(ALL_PRINTABLE_ASCII.match(skipAsciiHtml)).toStrictEqual(ALL_PRINTABLE_ASCII.match(SKIP_ASCII_HTML)); + }); +}); + describe('text-to-unicode', () => { interface TestConfig { text: string @@ -48,6 +65,18 @@ describe('text-to-unicode', () => { decimalEntities: 'ABC', }, }, + { + text: ALL_PRINTABLE_ASCII, + skipPrintableAscii: true, + results: { + // eslint-disable-next-line unicorn/escape-case + fullUnicode: String.raw` !\u0022#$%&\u0027()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005c]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`, + // eslint-disable-next-line unicorn/escape-case + utf16: String.raw` !\u0022#$%&\u0027()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005c]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`, + hexEntities: String.raw` !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`, + decimalEntities: String.raw` !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`, + }, + }, { text: '文字', results: { @@ -79,7 +108,7 @@ describe('text-to-unicode', () => { describe(key, () => { const converter = converters[key as ConverterId]; it('Escaping', () => { - expect(converter.escape(text, skipAscii ? SKIP_PRINTABLE_ASCII_RE : undefined)).toBe(result); + expect(converter.escape(text, skipAscii)).toBe(result); }); it('Unescaping', () => { expect(converter.unescape(result)).toBe(text); diff --git a/src/tools/text-to-unicode/text-to-unicode.service.ts b/src/tools/text-to-unicode/text-to-unicode.service.ts index c5cadf88..22568044 100644 --- a/src/tools/text-to-unicode/text-to-unicode.service.ts +++ b/src/tools/text-to-unicode/text-to-unicode.service.ts @@ -1,37 +1,35 @@ // regex that never matches const SKIP_NOTHING_RE = /(\b\B)/; -export const SKIP_PRINTABLE_ASCII_RE = /([ -~]+)/g; +export const SKIP_ASCII_JS = /([ -!#-&(-\[\]-~]+)/g; +export const SKIP_ASCII_HTML = /([ -!#-%(-;=?-~]+)/g; -function _codeUnits(text: string): number[] { +function codeUnits(text: string): number[] { return text.split('').map(char => char.codePointAt(0)); } -function _codePoints(text: string): number[] { +function codePoints(text: string): number[] { return [...text].map(char => char.codePointAt(0)); } export interface Converter { name: string - escape(text: string, skip: RegExp): string + escape(text: string, skipAscii: boolean): string unescape(text: string): string }; interface EscapeConfig { - getCharValues?(text: string): number[] + charValues?(text: string): number[] mapper(charValue: number): string + /** @prop regular expression for default content to skip. Must have exactly 1 capture group. */ + asciiSkipper: RegExp }; -function escaper({ getCharValues, mapper }: EscapeConfig) { - /** - * @param text text input to escape - * @param skipper regular expression for content _not_ to escape. Must have exactly 1 capture group. - */ - return (text: string, skipper?: RegExp): string => { - skipper ??= SKIP_NOTHING_RE; - getCharValues ??= _codePoints; +function escaper({ charValues: getCharValues, mapper, asciiSkipper: skipper }: EscapeConfig) { + return (text: string, skip: boolean): string => { + getCharValues ??= codePoints; return text - .split(skipper) + .split(skip ? skipper : SKIP_NOTHING_RE) .flatMap((x, i) => { if (i % 2) { return x; @@ -59,22 +57,22 @@ export type ConverterId = keyof typeof converters; const converters = { fullUnicode: { name: 'Full Unicode', - escape: escaper({ mapper: convertCodePointToUnicode }), + escape: escaper({ mapper: convertCodePointToUnicode, asciiSkipper: SKIP_ASCII_JS }), unescape: unescaper({ regex: /\\u\p{AHex}{4}|\\u\{\p{AHex}{1,6}\}/gu, radix: 16 }), }, utf16: { name: 'UTF-16 Code Units', - escape: escaper({ getCharValues: _codeUnits, mapper: convertCodePointToUnicode }), + escape: escaper({ charValues: codeUnits, mapper: convertCodePointToUnicode, asciiSkipper: SKIP_ASCII_JS }), unescape: unescaper({ regex: /\\u\p{AHex}{4}/gu, radix: 16 }), }, hexEntities: { name: 'HTML Entities (Hex)', - escape: escaper({ mapper: toHexEntities }), + escape: escaper({ mapper: toHexEntities, asciiSkipper: SKIP_ASCII_HTML }), unescape: unescaper({ regex: /&#x\p{AHex}{1,6};/gu, radix: 16 }), }, decimalEntities: { name: 'HTML Entities (Decimal)', - escape: escaper({ mapper: toDecimalEntities }), + escape: escaper({ mapper: toDecimalEntities, asciiSkipper: SKIP_ASCII_HTML }), unescape: unescaper({ regex: /&#\d+;/gu, radix: 10 }), }, } satisfies Record; diff --git a/src/tools/text-to-unicode/text-to-unicode.vue b/src/tools/text-to-unicode/text-to-unicode.vue index e4c58f19..ed185975 100644 --- a/src/tools/text-to-unicode/text-to-unicode.vue +++ b/src/tools/text-to-unicode/text-to-unicode.vue @@ -1,5 +1,5 @@