Added parsing of headers.

2025-06-14 18:25:16 -04:00 · 2019-09-03 20:33:57 -04:00 · 2019-09-03 20:33:57 -04:00 · 7f97afd3e0
commit 7f97afd3e0
parent c3994aa8e3
3 changed files with 172 additions and 78 deletions
--- a/src/core/operations/MIMEDecoding.mjs
+++ b/src/core/operations/MIMEDecoding.mjs
@ -9,6 +9,7 @@ import OperationError from "../errors/OperationError";
 import Utils from "../Utils";
 import { fromHex } from "../lib/Hex.mjs";
 import { fromBase64 } from "../lib/Base64";
+import cptable from "../vendor/js-codepage/cptable.js";

 /**
 * MIME Decoding operation
@ -23,24 +24,11 @@ class MIMEDecoding extends Operation {

        this.name = "MIME Decoding";
        this.module = "Default";
-        this.description = "";
-        this.infoURL = "";
+        this.description = "Enables the decoding of MIME message header extensions for non-ASCII text";
+        this.infoURL = "https://tools.ietf.org/html/rfc2047";
        this.inputType = "byteArray";
        this.outputType = "string";
-        this.args = [
-            /* Example arguments. See the project wiki for full details.
-            {
-                name: "First arg",
-                type: "string",
-                value: "Don't Panic"
-            },
-            {
-                name: "Second arg",
-                type: "number",
-                value: 42
-            }
-            */
-        ];
+        this.args = [];
    }

    /**
@ -49,73 +37,135 @@ class MIMEDecoding extends Operation {
     * @returns {string}
     */
    run(input, args) {
+        const mimeEncodedText = Utils.byteArrayToUtf8(input); // interpret the input bytes as UTF-8 text
+        const encodedHeaders = mimeEncodedText.replace(/\r\n/g, "\n"); // replace every CRLF with LF before scanning

-        let mimeEncodedText = Utils.byteArrayToUtf8(input)
+        const decodedHeader = this.decodeHeaders(encodedHeaders); // all encoded-word handling lives in decodeHeaders

-        let parsedString = "";
-        let currentPos = 0;
-        let pastPosition = 0;
-        while (currentPos >= 0) {
-            
-            // Find starting text
-            currentPos = mimeEncodedText.indexOf("=?", pastPosition);
-            console.log('CURRENT POSITION', currentPos);
-            if (currentPos < 0) break;
-
-            // Add existing unparsed string
-            let fillerText = mimeEncodedText.substring(pastPosition, currentPos);
-            console.log("PROCESSING RANGE", pastPosition, ' ' ,currentPos)
-            console.log('FILLER TEXT: ', fillerText);
-            if (fillerText.indexOf('\r') > 0) console.log('CR detected', fillerText.indexOf('\r'));
-            if (fillerText.indexOf('\n') > 0) console.log('LF detected', fillerText.indexOf('\n'));
-            if (fillerText.indexOf('\r\n') > 0) console.log('CRLF detected', fillerText.indexOf('\r\n'));
-            if (fillerText.indexOf('\x20') > 0) console.log('SPACE detected', fillerText.indexOf('\x20'));
-            if (fillerText.indexOf('\n\x20') > 0) console.log('newline SPACE detected', fillerText.indexOf('\x20'));
-
-            if (fillerText !== '\r\n')
-                parsedString += fillerText
-
-            pastPosition = currentPos;
-
-            // find ending text
-            currentPos = mimeEncodedText.indexOf("?=", pastPosition);
-
-            // Process block
-            let encodedTextBlock = mimeEncodedText.substring(pastPosition + 2, currentPos);
-            pastPosition = currentPos + 2;
-
-            parsedString += this.parseEncodedWord(encodedTextBlock);
+        return decodedHeader; // the args parameter is not used by this operation
    }

-        return parsedString;
+    /**
+     * Decodes the encoded words ("=?charset?encoding?text?=") found in a header string.
+     *
+     * Text before the first "=?" is copied through unchanged. Each encoded word is
+     * decoded with fromBase64 (encoding "b") or parseQEncodedWord (encoding "q"), both
+     * matched case-insensitively, and the result is passed to convertFromCharset.
+     * Text between two consecutive encoded words is dropped when it is whitespace
+     * only; otherwise it is kept. Whatever is left once scanning stops is appended as-is.
+     *
+     * NOTE(review): every malformed candidate hits a `break`, so the rest of the input
+     * is emitted undecoded. By inspection "=?=?utf-8?q?test?=" comes back unchanged,
+     * while the accompanying test expects "=?test" - confirm and fix.
+     * NOTE(review): the unknown-encoding branch rewrites `header` but does not
+     * `continue`; the code after it still uses `start` and `end` computed against the
+     * previous `header` and still calls convertFromCharset with the raw text.
+     *
+     * @param headerString - Header text; run() passes it with CRLF already replaced by LF.
+     * @returns The header text with recognised encoded words replaced by their decoded form.
+     */
+    decodeHeaders(headerString) {
+        // No encoded words detected
+        let i = headerString.indexOf("=?");
+        if (i === -1) return headerString;

-        throw new OperationError("Test");
-    }
+        let decodedHeaders = headerString.slice(0, i); // output so far: everything before the first "=?"
+        let header = headerString.slice(i); // input still to be scanned

-    parseEncodedWord(encodedWord) {
-        let [charset, encoding, encodedBlock] = encodedWord.split('?');
+        let isBetweenWords = false; // true once an encoded word has just been emitted
+        let start, cur, charset, encoding, j, end, text;
+        while (header.length > -1) { // always true: the loop only ends through the breaks below
+            start = header.indexOf("=?");
+            if (start === -1) break;
+            cur = start + "=?".length; // first character of the charset label

-        console.log('CURRENT BLOCK TO PROCESS', encodedBlock);
-        console.log('CURRENT CHARSET', charset);
+            i = header.slice(cur).indexOf("?"); // length of the charset label
+            if (i === -1) break;

-        let encodedText = '';
-        if (encoding.toLowerCase() === 'b') {
-            encodedText = fromBase64(encodedBlock);
+            charset = header.slice(cur, cur + i);
+            cur += i + "?".length;
+
+            if (header.length < cur + "Q??=".length) break; // need room for the encoding char, "?" and the closing "?="
+
+            encoding = header[cur];
+            cur += 1;
+
+            if (header[cur] !== "?") break; // the encoding must be a single character followed by "?"
+
+            cur += 1;
+
+            j = header.slice(cur).indexOf("?="); // length of the encoded text
+            if (j === -1) break;
+
+            text = header.slice(cur, cur + j);
+            end = cur + j + "?=".length; // index just past the closing "?="
+
+            if (encoding.toLowerCase() === "b") {
+                text = fromBase64(text);
+            } else if (encoding.toLowerCase() === "q") {
+                text = this.parseQEncodedWord(text);
            } else {
-            encodedText = encodedBlock;
-            let encodedChars = encodedText.indexOf("=");
-            if (encodedChars > 0) {
-                let extractedHex = encodedText.substring(encodedChars + 1, encodedChars + 3);
-                console.log("EXTRACTED HEX", extractedHex)
-                encodedText = encodedText.replace(`=${extractedHex}`, Utils.byteArrayToChars(fromHex(`=${extractedHex}`)))
+                isBetweenWords = false;
+                decodedHeaders += header.slice(0, start + 2); // emit everything up to and including this "=?"
+                header = header.slice(start + 2); // NOTE(review): no `continue` follows, see the method comment
            }

-            encodedText = encodedText.replace("_", " ");
+            if (start > 0 && (!isBetweenWords || header.slice(0, start).search(/\S/g) > -1)) { // keep the gap unless it is only whitespace between two encoded words
+                decodedHeaders += header.slice(0, start);
            }

-        return encodedText;
+            decodedHeaders += this.convertFromCharset(charset, text);
+
+            header = header.slice(end); // continue scanning after this encoded word
+            isBetweenWords = true;
        }

+        if (header.length > 0) {
+            decodedHeaders += header;
+        }
+
+        return decodedHeaders;
+    }
+
+    /**
+     * Converts decoded text for supported charsets.
+     * Supports UTF-8, US-ASCII, ISO-8859-*
+     *
+     * @param encodedWord
+     */
+    convertFromCharset(charset, encodedText) {
+        charset = charset.toLowerCase();
+        const parsedCharset = charset.split("-");
+
+        if (parsedCharset.length === 2 && parsedCharset[0] === "utf" && charset === "utf-8") {
+            return cptable.utils.decode(65001, encodedText);
+        } else if (parsedCharset.length === 2 && charset === "us-ascii") {
+            return cptable.utils.decode(20127, encodedText);
+        } else if (parsedCharset.length === 3 && parsedCharset[0] === "iso" && parsedCharset[1] === "8859") {
+            const isoCharset = parseInt(parsedCharset[2], 10);
+            if (isoCharset >= 1 && isoCharset <= 16) {
+                return cptable.utils.decode(28590 + isoCharset, encodedText);
+            }
+        }
+
+        throw new OperationError("Unhandled Charset");
+    }
+
+    /**
+     * Parses a Q encoded word
+     *
+     * @param encodedWord
+     */
+    parseQEncodedWord(encodedWord) {
+        let decodedWord = "";
+        for (let i = 0; i < encodedWord.length; i++) {
+            if (encodedWord[i] === "_") {
+                decodedWord += " ";
+            // Parse hex encoding
+            } else if (encodedWord[i] === "=") {
+                if ((i + 2) >= encodedWord.length) throw new OperationError("Incorrectly Encoded Word");
+                const decodedHex = Utils.byteArrayToChars(fromHex(encodedWord.substring(i + 1, i + 3)));
+                decodedWord += decodedHex;
+                i += 2;
+            } else if (
+                (encodedWord[i].charCodeAt(0) >= " ".charCodeAt(0) && encodedWord[i].charCodeAt(0) <= "~".charCodeAt(0)) ||
+                encodedWord[i] === "\n" ||
+                encodedWord[i] === "\r" ||
+                encodedWord[i] === "\t") {
+                decodedWord += encodedWord[i];
+            } else {
+                throw new OperationError("Incorrectly Encoded Word");
+            }
+        }
+
+        return decodedWord;
+    }
 }

 export default MIMEDecoding;
--- a/tests/operations/index.mjs
+++ b/tests/operations/index.mjs
@ -11,10 +11,7 @@
 * @license Apache-2.0
 */

-import {
-    setLongTestFailure,
-    logTestReport,
-} from "../lib/utils.mjs";
+import { setLongTestFailure, logTestReport } from "../lib/utils.mjs";

 import TestRegister from "../lib/TestRegister.mjs";
 import "./tests/AESKeyWrap.mjs";
@ -104,6 +101,7 @@ import "./tests/LZNT1Decompress.mjs";
 import "./tests/LZString.mjs";
 import "./tests/Magic.mjs";
 import "./tests/Media.mjs";
+import "./tests/MIMEDecoding";
 import "./tests/Modhex.mjs";
 import "./tests/MorseCode.mjs";
 import "./tests/MS.mjs";
@ -167,7 +165,7 @@ const testStatus = {
    allTestsPassing: true,
    counts: {
        total: 0,
-    }
+    },
 };

 setLongTestFailure();
--- a/tests/operations/tests/MIMEDecoding.mjs
+++ b/tests/operations/tests/MIMEDecoding.mjs
@ -0,0 +1,46 @@
+/**
+ * MIME Header Decoding tests
+ * 
+ * @author mshwed [m@ttshwed.com]
+ * @copyright Crown Copyright 2019
+ * @license Apache-2.0
+ */
+
+import TestRegister from "../../lib/TestRegister.mjs";
+
+TestRegister.addTests([
+    {
+        name: "Encoded =?",
+        input: "=?=?utf-8?q?test?=",
+        expectedOutput: "=?test",
+        recipeConfig: [
+            {
+                "op": "MIME Decoding",
+                "args": []
+            }
+        ]
+    },
+    {
+        name: "UTF-8 Encodings Multiple Headers",
+        input: "=?utf-8?q?=C3=89ric?= <eric@example.org>, =?utf-8?q?Ana=C3=AFs?= <anais@example.org>",
+        expectedOutput: "Éric <eric@example.org>, Anaïs <anais@example.org>",
+        recipeConfig: [
+            {
+                "op": "MIME Decoding",
+                "args": []
+            }
+        ]
+    },
+    {
+        name: "UTF-8 Encodings Single Header",
+        input: "=?utf-8?q?=C2=A1Hola,?= =?utf-8?q?_se=C3=B1or!?=",
+        expectedOutput: "¡Hola, señor!",
+        recipeConfig: [
+            {
+                "op": "MIME Decoding",
+                "args": []
+            }
+        ]
+    },
+
+]);