Always escape ASCII chars with special meaning

2025-07-02 19:02:24 -04:00 · 2024-05-15 10:53:37 +08:00 · 2024-05-15 10:53:37 +08:00 · 1dc965d9a8
commit 1dc965d9a8
parent b0ae8d7b3d
4 changed files with 57 additions and 28 deletions
--- a/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts
+++ b/src/tools/text-to-unicode/text-to-unicode.e2e.spec.ts
@@ -10,16 +10,18 @@ test.describe('Tool - Text to Unicode', () => {
  });

  test('Text to unicode conversion', async ({ page }) => {
-    await page.getByTestId('text-to-unicode-input').fill('it-tools');
+    await page.getByTestId('text-to-unicode-input').fill('"it-tools" 文字');
    const unicode = await page.getByTestId('text-to-unicode-output').inputValue();

-    expect(unicode).toEqual('&#105;&#116;&#45;&#116;&#111;&#111;&#108;&#115;');
+    // eslint-disable-next-line unicorn/escape-case
+    expect(unicode).toEqual(String.raw`\u0022it-tools\u0022 \u6587\u5b57`);
  });

  test('Unicode to text conversion', async ({ page }) => {
-    await page.getByTestId('unicode-to-text-input').fill('&#105;&#116;&#45;&#116;&#111;&#111;&#108;&#115;');
+    // eslint-disable-next-line unicorn/escape-case
+    await page.getByTestId('unicode-to-text-input').fill(String.raw`\u0022it-tools\u0022 \u6587\u5b57`);
    const text = await page.getByTestId('unicode-to-text-output').inputValue();

-    expect(text).toEqual('it-tools');
+    expect(text).toEqual('"it-tools" 文字');
  });
 });
--- a/src/tools/text-to-unicode/text-to-unicode.service.test.ts
+++ b/src/tools/text-to-unicode/text-to-unicode.service.test.ts
@@ -1,8 +1,8 @@
 import { describe, expect, it } from 'vitest';
-import { type ConverterId, SKIP_PRINTABLE_ASCII_RE, converters } from './text-to-unicode.service';
+import { type ConverterId, SKIP_ASCII_HTML, SKIP_ASCII_JS, converters } from './text-to-unicode.service';

 describe('text-to-unicode (legacy tests)', () => {
-  const convertTextToUnicode = converters.decimalEntities.escape;
+  const convertTextToUnicode = (text: string) => converters.decimalEntities.escape(text, false);
  const convertUnicodeToText = converters.decimalEntities.unescape;

  describe('convertTextToUnicode', () => {
@@ -22,6 +22,23 @@ describe('text-to-unicode (legacy tests)', () => {
  });
 });

+const ALL_PRINTABLE_ASCII = ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~';
+
+describe('text-to-unicode regexes', () => {
+  // eslint-disable-next-line prefer-regex-literals
+  const skipAsciiJs = new RegExp(String.raw`([[ -~]--['"\\]]+)`, 'gv');
+  // eslint-disable-next-line prefer-regex-literals
+  const skipAsciiHtml = new RegExp(String.raw`([[ -~]--[<>&'"]]+)`, 'gv');
+
+  it('regexes are equivalent to `v`-flag versions', () => {
+    // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicodeSets
+    // regexes in `text-to-unicode.service.ts` can be replaced with `v`-flag versions once unicodeSets reaches
+    // sufficient in-browser support
+    expect(ALL_PRINTABLE_ASCII.match(skipAsciiJs)).toStrictEqual(ALL_PRINTABLE_ASCII.match(SKIP_ASCII_JS));
+    expect(ALL_PRINTABLE_ASCII.match(skipAsciiHtml)).toStrictEqual(ALL_PRINTABLE_ASCII.match(SKIP_ASCII_HTML));
+  });
+});
+
 describe('text-to-unicode', () => {
  interface TestConfig {
    text: string
@@ -48,6 +65,18 @@ describe('text-to-unicode', () => {
        decimalEntities: 'ABC',
      },
    },
+    {
+      text: ALL_PRINTABLE_ASCII,
+      skipPrintableAscii: true,
+      results: {
+        // eslint-disable-next-line unicorn/escape-case
+        fullUnicode: String.raw` !\u0022#$%&\u0027()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005c]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`,
+        // eslint-disable-next-line unicorn/escape-case
+        utf16: String.raw` !\u0022#$%&\u0027()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005c]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`,
+        hexEntities: String.raw` !&#x22;#$%&#x26;&#x27;()*+,-./0123456789:;&#x3c;=&#x3e;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`,
+        decimalEntities: String.raw` !&#34;#$%&#38;&#39;()*+,-./0123456789:;&#60;=&#62;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_${'`'}abcdefghijklmnopqrstuvwxyz{|}~`,
+      },
+    },
    {
      text: '文字',
      results: {
@@ -79,7 +108,7 @@ describe('text-to-unicode', () => {
        describe(key, () => {
          const converter = converters[key as ConverterId];
          it('Escaping', () => {
-            expect(converter.escape(text, skipAscii ? SKIP_PRINTABLE_ASCII_RE : undefined)).toBe(result);
+            expect(converter.escape(text, skipAscii)).toBe(result);
          });
          it('Unescaping', () => {
            expect(converter.unescape(result)).toBe(text);
--- a/src/tools/text-to-unicode/text-to-unicode.service.ts
+++ b/src/tools/text-to-unicode/text-to-unicode.service.ts
@@ -1,37 +1,35 @@
 // regex that never matches
 const SKIP_NOTHING_RE = /(\b\B)/;
-export const SKIP_PRINTABLE_ASCII_RE = /([ -~]+)/g;
+export const SKIP_ASCII_JS = /([ -!#-&(-\[\]-~]+)/g;
+export const SKIP_ASCII_HTML = /([ -!#-%(-;=?-~]+)/g;

-function _codeUnits(text: string): number[] {
+function codeUnits(text: string): number[] {
  return text.split('').map(char => char.codePointAt(0));
 }

-function _codePoints(text: string): number[] {
+function codePoints(text: string): number[] {
  return [...text].map(char => char.codePointAt(0));
 }

 export interface Converter {
  name: string
-  escape(text: string, skip: RegExp): string
+  escape(text: string, skipAscii: boolean): string
  unescape(text: string): string
 };

 interface EscapeConfig {
-  getCharValues?(text: string): number[]
+  charValues?(text: string): number[]
  mapper(charValue: number): string
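+  /** Regular expression matching the runs of ASCII text to leave unescaped when skipping is enabled. Must have exactly 1 capture group, because the input is split on it and the captured runs are passed through as-is. */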
+  asciiSkipper: RegExp
 };

-function escaper({ getCharValues, mapper }: EscapeConfig) {
-  /**
-   * @param text text input to escape
-   * @param skipper regular expression for content _not_ to escape. Must have exactly 1 capture group.
-   */
-  return (text: string, skipper?: RegExp): string => {
-    skipper ??= SKIP_NOTHING_RE;
-    getCharValues ??= _codePoints;
+function escaper({ charValues: getCharValues, mapper, asciiSkipper: skipper }: EscapeConfig) {
+  return (text: string, skip: boolean): string => {
+    getCharValues ??= codePoints;

    return text
-      .split(skipper)
+      .split(skip ? skipper : SKIP_NOTHING_RE)
      .flatMap((x, i) => {
        if (i % 2) {
          return x;
@@ -59,22 +57,22 @@ export type ConverterId = keyof typeof converters;
 const converters = {
  fullUnicode: {
    name: 'Full Unicode',
-    escape: escaper({ mapper: convertCodePointToUnicode }),
+    escape: escaper({ mapper: convertCodePointToUnicode, asciiSkipper: SKIP_ASCII_JS }),
    unescape: unescaper({ regex: /\\u\p{AHex}{4}|\\u\{\p{AHex}{1,6}\}/gu, radix: 16 }),
  },
  utf16: {
    name: 'UTF-16 Code Units',
-    escape: escaper({ getCharValues: _codeUnits, mapper: convertCodePointToUnicode }),
+    escape: escaper({ charValues: codeUnits, mapper: convertCodePointToUnicode, asciiSkipper: SKIP_ASCII_JS }),
    unescape: unescaper({ regex: /\\u\p{AHex}{4}/gu, radix: 16 }),
  },
  hexEntities: {
    name: 'HTML Entities (Hex)',
-    escape: escaper({ mapper: toHexEntities }),
+    escape: escaper({ mapper: toHexEntities, asciiSkipper: SKIP_ASCII_HTML }),
    unescape: unescaper({ regex: /&#x\p{AHex}{1,6};/gu, radix: 16 }),
  },
  decimalEntities: {
    name: 'HTML Entities (Decimal)',
-    escape: escaper({ mapper: toDecimalEntities }),
+    escape: escaper({ mapper: toDecimalEntities, asciiSkipper: SKIP_ASCII_HTML }),
    unescape: unescaper({ regex: /&#\d+;/gu, radix: 10 }),
  },
 } satisfies Record<string, Converter>;
--- a/src/tools/text-to-unicode/text-to-unicode.vue
+++ b/src/tools/text-to-unicode/text-to-unicode.vue
@@ -1,5 +1,5 @@
 <script setup lang="ts">
-import { type ConverterId, SKIP_PRINTABLE_ASCII_RE, converters } from './text-to-unicode.service';
+import { type ConverterId, converters } from './text-to-unicode.service';
 import { useCopy } from '@/composable/copy';

 const converterId = ref<ConverterId>('fullUnicode');
@@ -9,7 +9,7 @@ const inputText = ref('');
 const unicodeFromText = computed(() =>
  inputText.value.trim() === ''
    ? ''
-    : converters[converterId.value].escape(inputText.value, skipAscii.value ? SKIP_PRINTABLE_ASCII_RE : undefined),
+    : converters[converterId.value].escape(inputText.value, skipAscii.value),
 );
 const { copy: copyUnicode } = useCopy({ source: unicodeFromText });

@@ -52,7 +52,7 @@ const { copy: copyText } = useCopy({ source: textFromUnicode });
        test-id="text-to-unicode-output"
      />
      <div mt-2 flex justify-start>
-        <n-form-item label="Skip ASCII?" :show-feedback="false" label-placement="left">
+        <n-form-item label="Skip ASCII chars with no special meaning?" :show-feedback="false" label-placement="left">
          <n-switch v-model:value="skipAscii" />
        </n-form-item>
      </div>