Automatically detect UTF8 character encoding in output

2025-06-17 19:55:18 -04:00 · 2024-03-26 13:33:00 +00:00 · 2024-03-26 13:33:00 +00:00 · 65ffd8d65d
commit 65ffd8d65d
parent 16dfb3fac6
7 changed files with 270 additions and 147 deletions
--- a/src/core/lib/ChrEnc.mjs
+++ b/src/core/lib/ChrEnc.mjs
@ -224,8 +224,85 @@ export function chrEncWidth(page) {
 * @copyright Crown Copyright 2019
 * @license Apache-2.0
 */
+export const UNICODE_NORMALISATION_FORMS = ["NFD", "NFC", "NFKD", "NFKC"]; // identifiers of the four Unicode normalisation forms
+

 /**
- * Character encoding format mappings.
+ * Detects whether the input buffer is valid UTF8.
+ *
+ * @param {ArrayBuffer} data
+ * @returns {number} - 0 = not UTF8, 1 = ASCII only, 2 = valid UTF8 containing at least one multi-byte sequence
 */
-export const UNICODE_NORMALISATION_FORMS = ["NFD", "NFC", "NFKD", "NFKC"];
+export function isUTF8(data) { // classifies data as 0 (not UTF8), 1 (ASCII only) or 2 (UTF8 with multi-byte sequences)
+    const bytes = new Uint8Array(data); // byte-wise view of the input buffer
+    let i = 0; // index of the first byte of the sequence currently being checked
+    let onlyASCII = true; // cleared the first time a byte falls outside the accepted ASCII set
+    while (i < bytes.length) {
+        if (( // ASCII: tab, LF, CR and the printable range 0x20-0x7E; any other byte below 0x80 matches no branch and is rejected
+            bytes[i] === 0x09 ||
+            bytes[i] === 0x0A ||
+            bytes[i] === 0x0D ||
+            (0x20 <= bytes[i] && bytes[i] <= 0x7E)
+        )) {
+            i += 1;
+            continue;
+        }
+
+        onlyASCII = false; // current byte is not accepted ASCII, so the result can no longer be 1
+
+        if (( // non-overlong 2-byte: lead bytes start at 0xC2 (0xC0/0xC1 are left out), continuation must be 0x80-0xBF
+            (0xC2 <= bytes[i] && bytes[i] <= 0xDF) &&
+            (0x80 <= bytes[i+1] && bytes[i+1] <= 0xBF) // a read past the end yields undefined, so a truncated sequence fails the comparison
+        )) {
+            i += 2;
+            continue;
+        }
+
+        if (( // 3-byte with lead 0xE0: second byte starts at 0xA0, excluding overlongs
+            bytes[i] === 0xE0 &&
+            (0xA0 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
+            (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF)
+        ) ||
+        ( // straight 3-byte: leads 0xE1-0xEC, 0xEE and 0xEF with two continuation bytes in 0x80-0xBF
+            ((0xE1 <= bytes[i] && bytes[i] <= 0xEC) ||
+            bytes[i] === 0xEE ||
+            bytes[i] === 0xEF) &&
+            (0x80 <= bytes[i + 1] && bytes[i+1] <= 0xBF) &&
+            (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
+        ) ||
+        ( // 3-byte with lead 0xED: second byte capped at 0x9F, excluding surrogates
+            bytes[i] === 0xED &&
+            (0x80 <= bytes[i+1] && bytes[i+1] <= 0x9F) &&
+            (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
+        )) {
+            i += 3;
+            continue;
+        }
+
+        if (( // planes 1-3: lead 0xF0, second byte starts at 0x90
+            bytes[i] === 0xF0 &&
+            (0x90 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
+            (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
+            (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
+        ) ||
+        ( // planes 4-15: leads 0xF1-0xF3 with three continuation bytes in 0x80-0xBF
+            (0xF1 <= bytes[i] && bytes[i] <= 0xF3) &&
+            (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
+            (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
+            (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
+        ) ||
+        ( // plane 16: lead 0xF4, second byte capped at 0x8F
+            bytes[i] === 0xF4 &&
+            (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0x8F) &&
+            (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
+            (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
+        )) {
+            i += 4;
+            continue;
+        }
+
+        return 0; // no accepted sequence starts at offset i
+    }
+
+    return onlyASCII ? 1 : 2; // every byte validated; an empty buffer never enters the loop and also yields 1
+}
--- a/src/core/lib/Magic.mjs
+++ b/src/core/lib/Magic.mjs
@ -3,6 +3,7 @@ import Utils, { isWorkerEnvironment } from "../Utils.mjs";
 import Recipe from "../Recipe.mjs";
 import Dish from "../Dish.mjs";
 import {detectFileType, isType} from "./FileType.mjs";
+import {isUTF8} from "./ChrEnc.mjs";
 import chiSquared from "chi-squared";

 /**
@ -111,82 +112,6 @@ class Magic {
        };
    }

-    /**
-     * Detects whether the input buffer is valid UTF8.
-     *
-     * @returns {boolean}
-     */
-    isUTF8() {
-        const bytes = new Uint8Array(this.inputBuffer);
-        let i = 0;
-        while (i < bytes.length) {
-            if (( // ASCII
-                bytes[i] === 0x09 ||
-                bytes[i] === 0x0A ||
-                bytes[i] === 0x0D ||
-                (0x20 <= bytes[i] && bytes[i] <= 0x7E)
-            )) {
-                i += 1;
-                continue;
-            }
-
-            if (( // non-overlong 2-byte
-                (0xC2 <= bytes[i] && bytes[i] <= 0xDF) &&
-                (0x80 <= bytes[i+1] && bytes[i+1] <= 0xBF)
-            )) {
-                i += 2;
-                continue;
-            }
-
-            if (( // excluding overlongs
-                bytes[i] === 0xE0 &&
-                (0xA0 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
-                (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF)
-            ) ||
-            ( // straight 3-byte
-                ((0xE1 <= bytes[i] && bytes[i] <= 0xEC) ||
-                bytes[i] === 0xEE ||
-                bytes[i] === 0xEF) &&
-                (0x80 <= bytes[i + 1] && bytes[i+1] <= 0xBF) &&
-                (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
-            ) ||
-            ( // excluding surrogates
-                bytes[i] === 0xED &&
-                (0x80 <= bytes[i+1] && bytes[i+1] <= 0x9F) &&
-                (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
-            )) {
-                i += 3;
-                continue;
-            }
-
-            if (( // planes 1-3
-                bytes[i] === 0xF0 &&
-                (0x90 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
-                (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
-                (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
-            ) ||
-            ( // planes 4-15
-                (0xF1 <= bytes[i] && bytes[i] <= 0xF3) &&
-                (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
-                (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
-                (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
-            ) ||
-            ( // plane 16
-                bytes[i] === 0xF4 &&
-                (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0x8F) &&
-                (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
-                (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
-            )) {
-                i += 4;
-                continue;
-            }
-
-            return false;
-        }
-
-        return true;
-    }
-
    /**
     * Calculates the Shannon entropy of the input data.
     *
@ -336,7 +261,7 @@ class Magic {
            data: this.inputStr.slice(0, 100),
            languageScores: this.detectLanguage(extLang),
            fileType: this.detectFileType(),
-            isUTF8: this.isUTF8(),
+            isUTF8: !!isUTF8(this.inputBuffer),
            entropy: this.calcEntropy(),
            matchingOps: matchingOps,
            useful: useful,