Always escape ASCII chars with special meaning

This commit is contained in:
lionel-rowe 2024-05-15 10:53:37 +08:00
parent b0ae8d7b3d
commit 1dc965d9a8
No known key found for this signature in database
4 changed files with 57 additions and 28 deletions

View file

@ -10,16 +10,18 @@ test.describe('Tool - Text to Unicode', () => {
}); });
test('Text to unicode conversion', async ({ page }) => { test('Text to unicode conversion', async ({ page }) => {
await page.getByTestId('text-to-unicode-input').fill('it-tools'); await page.getByTestId('text-to-unicode-input').fill('"it-tools" 文字');
const unicode = await page.getByTestId('text-to-unicode-output').inputValue(); const unicode = await page.getByTestId('text-to-unicode-output').inputValue();
expect(unicode).toEqual('it-tools'); // eslint-disable-next-line unicorn/escape-case
expect(unicode).toEqual(String.raw`\u0022it-tools\u0022 \u6587\u5b57`);
}); });
test('Unicode to text conversion', async ({ page }) => { test('Unicode to text conversion', async ({ page }) => {
await page.getByTestId('unicode-to-text-input').fill('it-tools'); // eslint-disable-next-line unicorn/escape-case
await page.getByTestId('unicode-to-text-input').fill(String.raw`\u0022it-tools\u0022 \u6587\u5b57`);
const text = await page.getByTestId('unicode-to-text-output').inputValue(); const text = await page.getByTestId('unicode-to-text-output').inputValue();
expect(text).toEqual('it-tools'); expect(text).toEqual('"it-tools" 文字');
}); });
}); });

View file

@ -1,8 +1,8 @@
import { describe, expect, it } from 'vitest'; import { describe, expect, it } from 'vitest';
import { type ConverterId, SKIP_PRINTABLE_ASCII_RE, converters } from './text-to-unicode.service'; import { type ConverterId, SKIP_ASCII_HTML, SKIP_ASCII_JS, converters } from './text-to-unicode.service';
describe('text-to-unicode (legacy tests)', () => { describe('text-to-unicode (legacy tests)', () => {
const convertTextToUnicode = converters.decimalEntities.escape; const convertTextToUnicode = (text: string) => converters.decimalEntities.escape(text, false);
const convertUnicodeToText = converters.decimalEntities.unescape; const convertUnicodeToText = converters.decimalEntities.unescape;
describe('convertTextToUnicode', () => { describe('convertTextToUnicode', () => {
@ -22,6 +22,23 @@ describe('text-to-unicode (legacy tests)', () => {
}); });
}); });
// Every printable ASCII character, from space (U+0020) through tilde (U+007E), in code-point order.
// Used as an exhaustive fixture so the skip regexes and the per-converter escaping are checked
// against the whole printable range rather than a hand-picked sample.
const ALL_PRINTABLE_ASCII = ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~';
// Guards the hand-written character-class ranges exported by the service (SKIP_ASCII_JS,
// SKIP_ASCII_HTML) by comparing them against reference regexes written with `v`-flag set
// subtraction, where the excluded characters are listed explicitly.
describe('text-to-unicode regexes', () => {
// Reference for the JS-style escapes: printable ASCII ([ -~]) minus `'`, `"` and `\`.
// Built via `new RegExp` from a string (hence the lint suppression) instead of a `/…/v` literal.
// eslint-disable-next-line prefer-regex-literals
const skipAsciiJs = new RegExp(String.raw`([[ -~]--['"\\]]+)`, 'gv');
// Reference for the HTML-entity escapes: printable ASCII ([ -~]) minus `<`, `>`, `&`, `'` and `"`.
// eslint-disable-next-line prefer-regex-literals
const skipAsciiHtml = new RegExp(String.raw`([[ -~]--[<>&'"]]+)`, 'gv');
it('regexes are equivalent to `v`-flag versions', () => {
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicodeSets
// regexes in `text-to-unicode.service.ts` can be replaced with `v`-flag versions once unicodeSets reaches
// sufficient in-browser support
// Both regexes are global, so `match` returns every matched run; identical arrays mean the
// service regex and its reference split the full printable-ASCII fixture at the same characters.
expect(ALL_PRINTABLE_ASCII.match(skipAsciiJs)).toStrictEqual(ALL_PRINTABLE_ASCII.match(SKIP_ASCII_JS));
expect(ALL_PRINTABLE_ASCII.match(skipAsciiHtml)).toStrictEqual(ALL_PRINTABLE_ASCII.match(SKIP_ASCII_HTML));
});
});
describe('text-to-unicode', () => { describe('text-to-unicode', () => {
interface TestConfig { interface TestConfig {
text: string text: string
@ -48,6 +65,18 @@ describe('text-to-unicode', () => {
decimalEntities: 'ABC', decimalEntities: 'ABC',
}, },
}, },
{
text: ALL_PRINTABLE_ASCII,
skipPrintableAscii: true,
results: {
// eslint-disable-next-line unicorn/escape-case
fullUnicode: String.raw` !\u0022#$%&\u0027()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005c]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`,
// eslint-disable-next-line unicorn/escape-case
utf16: String.raw` !\u0022#$%&\u0027()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005c]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`,
hexEntities: String.raw` !&#x22;#$%&#x26;&#x27;()*+,-./0123456789:;&#x3c;=&#x3e;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`,
decimalEntities: String.raw` !&#34;#$%&#38;&#39;()*+,-./0123456789:;&#60;=&#62;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`,
},
},
{ {
text: '文字', text: '文字',
results: { results: {
@ -79,7 +108,7 @@ describe('text-to-unicode', () => {
describe(key, () => { describe(key, () => {
const converter = converters[key as ConverterId]; const converter = converters[key as ConverterId];
it('Escaping', () => { it('Escaping', () => {
expect(converter.escape(text, skipAscii ? SKIP_PRINTABLE_ASCII_RE : undefined)).toBe(result); expect(converter.escape(text, skipAscii)).toBe(result);
}); });
it('Unescaping', () => { it('Unescaping', () => {
expect(converter.unescape(result)).toBe(text); expect(converter.unescape(result)).toBe(text);

View file

@ -1,37 +1,35 @@
// regex that never matches // regex that never matches
const SKIP_NOTHING_RE = /(\b\B)/; const SKIP_NOTHING_RE = /(\b\B)/;
export const SKIP_PRINTABLE_ASCII_RE = /([ -~]+)/g; export const SKIP_ASCII_JS = /([ -!#-&(-\[\]-~]+)/g;
/**
 * Matches runs of printable ASCII that are left unescaped for the HTML-entity converters.
 * The class covers U+0020–U+007E except `"` (U+0022), `&` (U+0026), `'` (U+0027), `<` (U+003C)
 * and `>` (U+003E): ` -!` | `#-%` | `(-;` | `=` | `?-~`.
 * Global, with exactly one capture group wrapping the whole run.
 */
export const SKIP_ASCII_HTML = /([ -!#-%(-;=?-~]+)/g;
function _codeUnits(text: string): number[] { function codeUnits(text: string): number[] {
return text.split('').map(char => char.codePointAt(0)); return text.split('').map(char => char.codePointAt(0));
} }
function _codePoints(text: string): number[] { function codePoints(text: string): number[] {
return [...text].map(char => char.codePointAt(0)); return [...text].map(char => char.codePointAt(0));
} }
export interface Converter { export interface Converter {
name: string name: string
escape(text: string, skip: RegExp): string escape(text: string, skipAscii: boolean): string
unescape(text: string): string unescape(text: string): string
}; };
interface EscapeConfig { interface EscapeConfig {
getCharValues?(text: string): number[] charValues?(text: string): number[]
mapper(charValue: number): string mapper(charValue: number): string
/** @prop regular expression for default content to skip. Must have exactly 1 capture group. */
asciiSkipper: RegExp
}; };
function escaper({ getCharValues, mapper }: EscapeConfig) { function escaper({ charValues: getCharValues, mapper, asciiSkipper: skipper }: EscapeConfig) {
/** return (text: string, skip: boolean): string => {
* @param text text input to escape getCharValues ??= codePoints;
* @param skipper regular expression for content _not_ to escape. Must have exactly 1 capture group.
*/
return (text: string, skipper?: RegExp): string => {
skipper ??= SKIP_NOTHING_RE;
getCharValues ??= _codePoints;
return text return text
.split(skipper) .split(skip ? skipper : SKIP_NOTHING_RE)
.flatMap((x, i) => { .flatMap((x, i) => {
if (i % 2) { if (i % 2) {
return x; return x;
@ -59,22 +57,22 @@ export type ConverterId = keyof typeof converters;
const converters = { const converters = {
fullUnicode: { fullUnicode: {
name: 'Full Unicode', name: 'Full Unicode',
escape: escaper({ mapper: convertCodePointToUnicode }), escape: escaper({ mapper: convertCodePointToUnicode, asciiSkipper: SKIP_ASCII_JS }),
unescape: unescaper({ regex: /\\u\p{AHex}{4}|\\u\{\p{AHex}{1,6}\}/gu, radix: 16 }), unescape: unescaper({ regex: /\\u\p{AHex}{4}|\\u\{\p{AHex}{1,6}\}/gu, radix: 16 }),
}, },
utf16: { utf16: {
name: 'UTF-16 Code Units', name: 'UTF-16 Code Units',
escape: escaper({ getCharValues: _codeUnits, mapper: convertCodePointToUnicode }), escape: escaper({ charValues: codeUnits, mapper: convertCodePointToUnicode, asciiSkipper: SKIP_ASCII_JS }),
unescape: unescaper({ regex: /\\u\p{AHex}{4}/gu, radix: 16 }), unescape: unescaper({ regex: /\\u\p{AHex}{4}/gu, radix: 16 }),
}, },
hexEntities: { hexEntities: {
name: 'HTML Entities (Hex)', name: 'HTML Entities (Hex)',
escape: escaper({ mapper: toHexEntities }), escape: escaper({ mapper: toHexEntities, asciiSkipper: SKIP_ASCII_HTML }),
unescape: unescaper({ regex: /&#x\p{AHex}{1,6};/gu, radix: 16 }), unescape: unescaper({ regex: /&#x\p{AHex}{1,6};/gu, radix: 16 }),
}, },
decimalEntities: { decimalEntities: {
name: 'HTML Entities (Decimal)', name: 'HTML Entities (Decimal)',
escape: escaper({ mapper: toDecimalEntities }), escape: escaper({ mapper: toDecimalEntities, asciiSkipper: SKIP_ASCII_HTML }),
unescape: unescaper({ regex: /&#\d+;/gu, radix: 10 }), unescape: unescaper({ regex: /&#\d+;/gu, radix: 10 }),
}, },
} satisfies Record<string, Converter>; } satisfies Record<string, Converter>;

View file

@ -1,5 +1,5 @@
<script setup lang="ts"> <script setup lang="ts">
import { type ConverterId, SKIP_PRINTABLE_ASCII_RE, converters } from './text-to-unicode.service'; import { type ConverterId, converters } from './text-to-unicode.service';
import { useCopy } from '@/composable/copy'; import { useCopy } from '@/composable/copy';
const converterId = ref<ConverterId>('fullUnicode'); const converterId = ref<ConverterId>('fullUnicode');
@ -9,7 +9,7 @@ const inputText = ref('');
const unicodeFromText = computed(() => const unicodeFromText = computed(() =>
inputText.value.trim() === '' inputText.value.trim() === ''
? '' ? ''
: converters[converterId.value].escape(inputText.value, skipAscii.value ? SKIP_PRINTABLE_ASCII_RE : undefined), : converters[converterId.value].escape(inputText.value, skipAscii.value),
); );
const { copy: copyUnicode } = useCopy({ source: unicodeFromText }); const { copy: copyUnicode } = useCopy({ source: unicodeFromText });
@ -52,7 +52,7 @@ const { copy: copyText } = useCopy({ source: textFromUnicode });
test-id="text-to-unicode-output" test-id="text-to-unicode-output"
/> />
<div mt-2 flex justify-start> <div mt-2 flex justify-start>
<n-form-item label="Skip ASCII?" :show-feedback="false" label-placement="left"> <n-form-item label="Skip ASCII chars with no special meaning?" :show-feedback="false" label-placement="left">
<n-switch v-model:value="skipAscii" /> <n-switch v-model:value="skipAscii" />
</n-form-item> </n-form-item>
</div> </div>