diff --git a/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts b/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts index 761828fd..139c38b2 100644 --- a/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts +++ b/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts @@ -10,16 +10,18 @@ test.describe('Tool - Text to Unicode', () => { }); test('Text to unicode conversion', async ({ page }) => { - await page.getByTestId('text-to-unicode-input').fill('it-tools'); + await page.getByTestId('text-to-unicode-input').fill('"it-tools" 文字'); const unicode = await page.getByTestId('text-to-unicode-output').inputValue(); - expect(unicode).toEqual('it-tools'); + // eslint-disable-next-line unicorn/escape-case + expect(unicode).toEqual(String.raw`\u0022it-tools\u0022 \u6587\u5b57`); }); test('Unicode to text conversion', async ({ page }) => { - await page.getByTestId('unicode-to-text-input').fill('it-tools'); + // eslint-disable-next-line unicorn/escape-case + await page.getByTestId('unicode-to-text-input').fill(String.raw`\u0022it-tools\u0022 \u6587\u5b57`); const text = await page.getByTestId('unicode-to-text-output').inputValue(); - expect(text).toEqual('it-tools'); + expect(text).toEqual('"it-tools" 文字'); }); }); diff --git a/src/tools/text-to-unicode/text-to-unicode.service.test.ts b/src/tools/text-to-unicode/text-to-unicode.service.test.ts index bda4fa7a..3c79622a 100644 --- a/src/tools/text-to-unicode/text-to-unicode.service.test.ts +++ b/src/tools/text-to-unicode/text-to-unicode.service.test.ts @@ -1,7 +1,10 @@ import { describe, expect, it } from 'vitest'; -import { convertTextToUnicode, convertUnicodeToText } from './text-to-unicode.service'; +import { type ConverterId, SKIP_ASCII_HTML, SKIP_ASCII_JS, converters } from './text-to-unicode.service'; + +describe('text-to-unicode (legacy tests)', () => { + const convertTextToUnicode = (text: string) => converters.decimalEntities.escape(text, false); + const convertUnicodeToText = (escaped: string) => converters.decimalEntities.unescape(escaped); -describe('text-to-unicode', () => { describe('convertTextToUnicode', () => { it('a text string is converted to unicode representation', () => { expect(convertTextToUnicode('A')).toBe('A'); @@ -18,3 +21,100 @@ describe('text-to-unicode', () => { }); }); }); + +const ALL_PRINTABLE_ASCII = ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~'; + +describe('text-to-unicode regexes', () => { + // eslint-disable-next-line prefer-regex-literals + const skipAsciiJs = new RegExp(String.raw`([[ -~]--['"\\]]+)`, 'gv'); + // eslint-disable-next-line prefer-regex-literals + const skipAsciiHtml = new RegExp(String.raw`([[ -~]--[<>&'"]]+)`, 'gv'); + + it('regexes are equivalent to `v`-flag versions', () => { + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicodeSets + // regexes in `text-to-unicode.service.ts` can be replaced with `v`-flag versions once unicodeSets reaches + // sufficient in-browser support + expect(ALL_PRINTABLE_ASCII.match(skipAsciiJs)).toStrictEqual(ALL_PRINTABLE_ASCII.match(SKIP_ASCII_JS)); + expect(ALL_PRINTABLE_ASCII.match(skipAsciiHtml)).toStrictEqual(ALL_PRINTABLE_ASCII.match(SKIP_ASCII_HTML)); + }); +}); + +describe('text-to-unicode', () => { + interface TestConfig { + text: string + results: Record + skipAscii?: boolean + }; + const tests: TestConfig[] = [ + { + text: 'ABC', + results: { + fullUnicode: String.raw`\u0041\u0042\u0043`, + utf16: String.raw`\u0041\u0042\u0043`, + hexEntities: String.raw`ABC`, + decimalEntities: String.raw`ABC`, + }, + }, + { + text: 'ABC', + skipAscii: true, + results: { + fullUnicode: 'ABC', + utf16: 'ABC', + hexEntities: 'ABC', + decimalEntities: 'ABC', + }, + }, + { + text: ALL_PRINTABLE_ASCII, + skipAscii: true, + results: { + // eslint-disable-next-line unicorn/escape-case + fullUnicode: String.raw` !\u0022#$%&\u0027()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005c]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`, + // eslint-disable-next-line unicorn/escape-case + utf16: String.raw` !\u0022#$%&\u0027()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005c]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`, + hexEntities: String.raw` !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`, + decimalEntities: String.raw` !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`, + }, + }, + { + text: '文字', + results: { + // eslint-disable-next-line unicorn/escape-case + fullUnicode: String.raw`\u6587\u5b57`, + // eslint-disable-next-line unicorn/escape-case + utf16: String.raw`\u6587\u5b57`, + hexEntities: String.raw`文字`, + decimalEntities: String.raw`文字`, + }, + }, + { + text: 'a 💩 b', + skipAscii: true, + results: { + // eslint-disable-next-line unicorn/escape-case + fullUnicode: String.raw`a \u{1f4a9} b`, + // eslint-disable-next-line unicorn/escape-case + utf16: String.raw`a \ud83d\udca9 b`, + hexEntities: String.raw`a 💩 b`, + decimalEntities: String.raw`a 💩 b`, + }, + }, + ]; + + for (const { text, skipAscii, results } of tests) { + describe(`${text} (skipAscii=${skipAscii})`, () => { + for (const [key, result] of Object.entries(results)) { + describe(key, () => { + const converter = converters[key as ConverterId]; + it('Escaping', () => { + expect(converter.escape(text, skipAscii ?? false)).toBe(result); + }); + it('Unescaping', () => { + expect(converter.unescape(result)).toBe(text); + }); + }); + } + }); + } +}); diff --git a/src/tools/text-to-unicode/text-to-unicode.service.ts b/src/tools/text-to-unicode/text-to-unicode.service.ts index e7772cf8..407bb6f9 100644 --- a/src/tools/text-to-unicode/text-to-unicode.service.ts +++ b/src/tools/text-to-unicode/text-to-unicode.service.ts @@ -1,9 +1,86 @@ -function convertTextToUnicode(text: string): string { - return text.split('').map(value => `&#${value.charCodeAt(0)};`).join(''); +// regex that never matches +const SKIP_NOTHING_RE = /(\b\B)/; +export const SKIP_ASCII_JS = /([ -!#-&(-\[\]-~]+)/g; +export const SKIP_ASCII_HTML = /([ -!#-%(-;=?-~]+)/g; + +function codeUnits(text: string): number[] { + return text.split('').map(char => char.codePointAt(0)!); } -function convertUnicodeToText(unicodeStr: string): string { - return unicodeStr.replace(/&#(\d+);/g, (match, dec) => String.fromCharCode(dec)); +function codePoints(text: string): number[] { + return [...text].map(char => char.codePointAt(0)!); } -export { convertTextToUnicode, convertUnicodeToText }; +interface ConverterConfig { + name: string + escape: { + charValues?(text: string): number[] + mapper(charValue: number): string + /** @prop regular expression for default content to skip. Must have exactly 1 capture group. */ + asciiSkipper: RegExp + } + unescape: { + regex: RegExp + radix: number + } +} +class Converter { + constructor(public config: ConverterConfig) {} + + escape(text: string, skipAscii: boolean): string { + const { asciiSkipper, charValues, mapper } = this.config.escape; + const getCharValues = charValues ?? codePoints; + + return text + .split(skipAscii ? asciiSkipper : SKIP_NOTHING_RE) + .flatMap((x, i) => i % 2 ? x : getCharValues(x).map(mapper)) + .join(''); + } + + unescape(escaped: string): string { + const { regex, radix } = this.config.unescape; + + return escaped.replace(regex, (match) => { + return String.fromCodePoint(Number.parseInt(match.replace(/\P{AHex}/gu, ''), radix)); + }); + } +} + +export type ConverterId = keyof typeof converters; +const converters = { + fullUnicode: new Converter({ + name: 'Full Unicode', + escape: { mapper: convertCodePointToUnicode, asciiSkipper: SKIP_ASCII_JS }, + unescape: { regex: /\\u\p{AHex}{4}|\\u\{\p{AHex}{1,6}\}/gu, radix: 16 }, + }), + utf16: new Converter({ + name: 'UTF-16 Code Units', + escape: { charValues: codeUnits, mapper: convertCodePointToUnicode, asciiSkipper: SKIP_ASCII_JS }, + unescape: { regex: /\\u\p{AHex}{4}/gu, radix: 16 }, + }), + hexEntities: new Converter({ + name: 'HTML Entities (Hex)', + escape: { mapper: toHexEntities, asciiSkipper: SKIP_ASCII_HTML }, + unescape: { regex: /&#x\p{AHex}{1,6};/gu, radix: 16 }, + }), + decimalEntities: new Converter({ + name: 'HTML Entities (Decimal)', + escape: { mapper: toDecimalEntities, asciiSkipper: SKIP_ASCII_HTML }, + unescape: { regex: /&#\d+;/gu, radix: 10 }, + }), +} satisfies Record; + +function convertCodePointToUnicode(codePoint: number): string { + const hex = codePoint.toString(16); + return hex.length > 4 ? String.raw`\u{${hex}}` : String.raw`\u${hex.padStart(4, '0')}`; +} + +function toHexEntities(codePoint: number): string { + return `&#x${codePoint.toString(16)};`; +} + +function toDecimalEntities(codePoint: number): string { + return `&#${codePoint};`; +} + +export { converters }; diff --git a/src/tools/text-to-unicode/text-to-unicode.vue b/src/tools/text-to-unicode/text-to-unicode.vue index be9bed86..8dea882e 100644 --- a/src/tools/text-to-unicode/text-to-unicode.vue +++ b/src/tools/text-to-unicode/text-to-unicode.vue @@ -1,34 +1,106 @@ + +