From b0ae8d7b3dda98d056a0814051ff5ebf45388f16 Mon Sep 17 00:00:00 2001 From: lionel-rowe Date: Tue, 14 May 2024 21:45:31 +0800 Subject: [PATCH] fix(text-to-unicode): handle non-BMP + more conversion options --- .../text-to-unicode.service.test.ts | 75 +++++++++++- .../text-to-unicode.service.ts | 96 ++++++++++++++- src/tools/text-to-unicode/text-to-unicode.vue | 114 ++++++++++++++---- 3 files changed, 257 insertions(+), 28 deletions(-) diff --git a/src/tools/text-to-unicode/text-to-unicode.service.test.ts b/src/tools/text-to-unicode/text-to-unicode.service.test.ts index bda4fa7a..6b81a7c7 100644 --- a/src/tools/text-to-unicode/text-to-unicode.service.test.ts +++ b/src/tools/text-to-unicode/text-to-unicode.service.test.ts @@ -1,7 +1,10 @@ import { describe, expect, it } from 'vitest'; -import { convertTextToUnicode, convertUnicodeToText } from './text-to-unicode.service'; +import { type ConverterId, SKIP_PRINTABLE_ASCII_RE, converters } from './text-to-unicode.service'; + +describe('text-to-unicode (legacy tests)', () => { + const convertTextToUnicode = converters.decimalEntities.escape; + const convertUnicodeToText = converters.decimalEntities.unescape; -describe('text-to-unicode', () => { describe('convertTextToUnicode', () => { it('a text string is converted to unicode representation', () => { expect(convertTextToUnicode('A')).toBe('A'); @@ -18,3 +21,71 @@ describe('text-to-unicode', () => { }); }); }); + +describe('text-to-unicode', () => { + interface TestConfig { + text: string + results: Record + skipPrintableAscii?: boolean + }; + const tests: TestConfig[] = [ + { + text: 'ABC', + results: { + fullUnicode: String.raw`\u0041\u0042\u0043`, + utf16: String.raw`\u0041\u0042\u0043`, + hexEntities: String.raw`ABC`, + decimalEntities: String.raw`ABC`, + }, + }, + { + text: 'ABC', + skipPrintableAscii: true, + results: { + fullUnicode: 'ABC', + utf16: 'ABC', + hexEntities: 'ABC', + decimalEntities: 'ABC', + }, + }, + { + text: '文字', + results: { + // eslint-disable-next-line unicorn/escape-case + fullUnicode: String.raw`\u6587\u5b57`, + // eslint-disable-next-line unicorn/escape-case + utf16: String.raw`\u6587\u5b57`, + hexEntities: String.raw`文字`, + decimalEntities: String.raw`文字`, + }, + }, + { + text: 'a 💩 b', + skipPrintableAscii: true, + results: { + // eslint-disable-next-line unicorn/escape-case + fullUnicode: String.raw`a \u{1f4a9} b`, + // eslint-disable-next-line unicorn/escape-case + utf16: String.raw`a \ud83d\udca9 b`, + hexEntities: String.raw`a 💩 b`, + decimalEntities: String.raw`a 💩 b`, + }, + }, + ]; + + for (const { text, skipPrintableAscii: skipAscii, results } of tests) { + describe(`${text} (skipAscii=${skipAscii})`, () => { + for (const [key, result] of Object.entries(results)) { + describe(key, () => { + const converter = converters[key as ConverterId]; + it('Escaping', () => { + expect(converter.escape(text, skipAscii ? SKIP_PRINTABLE_ASCII_RE : undefined)).toBe(result); + }); + it('Unescaping', () => { + expect(converter.unescape(result)).toBe(text); + }); + }); + } + }); + } +}); diff --git a/src/tools/text-to-unicode/text-to-unicode.service.ts b/src/tools/text-to-unicode/text-to-unicode.service.ts index e7772cf8..c5cadf88 100644 --- a/src/tools/text-to-unicode/text-to-unicode.service.ts +++ b/src/tools/text-to-unicode/text-to-unicode.service.ts @@ -1,9 +1,95 @@ -function convertTextToUnicode(text: string): string { - return text.split('').map(value => `&#${value.charCodeAt(0)};`).join(''); +// regex that never matches +const SKIP_NOTHING_RE = /(\b\B)/; +export const SKIP_PRINTABLE_ASCII_RE = /([ -~]+)/g; + +function _codeUnits(text: string): number[] { + return text.split('').map(char => char.codePointAt(0)); } -function convertUnicodeToText(unicodeStr: string): string { - return unicodeStr.replace(/&#(\d+);/g, (match, dec) => String.fromCharCode(dec)); +function _codePoints(text: string): number[] { + return [...text].map(char => char.codePointAt(0)); } -export { convertTextToUnicode, convertUnicodeToText }; +export interface Converter { + name: string + escape(text: string, skip: RegExp): string + unescape(text: string): string +}; + +interface EscapeConfig { + getCharValues?(text: string): number[] + mapper(charValue: number): string +}; + +function escaper({ getCharValues, mapper }: EscapeConfig) { + /** + * @param text text input to escape + * @param skipper regular expression for content _not_ to escape. Must have exactly 1 capture group. + */ + return (text: string, skipper?: RegExp): string => { + skipper ??= SKIP_NOTHING_RE; + getCharValues ??= _codePoints; + + return text + .split(skipper) + .flatMap((x, i) => { + if (i % 2) { + return x; + } + return getCharValues(x).map(mapper); + }) + .join(''); + }; +} + +interface UnescapeConfig { + regex: RegExp + radix: number +}; + +function unescaper({ regex, radix }: UnescapeConfig) { + return (escaped: string): string => { + return escaped.replace(regex, (match) => { + return String.fromCodePoint(Number.parseInt(match.replace(/\P{AHex}/gu, ''), radix)); + }); + }; +} + +export type ConverterId = keyof typeof converters; +const converters = { + fullUnicode: { + name: 'Full Unicode', + escape: escaper({ mapper: convertCodePointToUnicode }), + unescape: unescaper({ regex: /\\u\p{AHex}{4}|\\u\{\p{AHex}{1,6}\}/gu, radix: 16 }), + }, + utf16: { + name: 'UTF-16 Code Units', + escape: escaper({ getCharValues: _codeUnits, mapper: convertCodePointToUnicode }), + unescape: unescaper({ regex: /\\u\p{AHex}{4}/gu, radix: 16 }), + }, + hexEntities: { + name: 'HTML Entities (Hex)', + escape: escaper({ mapper: toHexEntities }), + unescape: unescaper({ regex: /&#x\p{AHex}{1,6};/gu, radix: 16 }), + }, + decimalEntities: { + name: 'HTML Entities (Decimal)', + escape: escaper({ mapper: toDecimalEntities }), + unescape: unescaper({ regex: /&#\d+;/gu, radix: 10 }), + }, +} satisfies Record; + +function convertCodePointToUnicode(codePoint: number): string { + const hex = codePoint.toString(16); + return hex.length > 4 ? String.raw`\u{${hex}}` : String.raw`\u${hex.padStart(4, '0')}`; +} + +function toHexEntities(codePoint: number): string { + return `&#x${codePoint.toString(16)};`; +} + +function toDecimalEntities(codePoint: number): string { + return `&#${codePoint};`; +} + +export { converters }; diff --git a/src/tools/text-to-unicode/text-to-unicode.vue b/src/tools/text-to-unicode/text-to-unicode.vue index be9bed86..e4c58f19 100644 --- a/src/tools/text-to-unicode/text-to-unicode.vue +++ b/src/tools/text-to-unicode/text-to-unicode.vue @@ -1,34 +1,106 @@ + +