Merge pull request #630 from MShwed/feature/mime-rfc2047

Feature: MIME RFC2047 Decoding
2025-07-01 18:32:26 -04:00 · 2025-02-16 12:11:02 +00:00 · 2025-02-16 12:11:02 +00:00 · 7906f9d560
commit 7906f9d560
parent 1cfbc2babb 2ae923b73e
4 changed files with 263 additions and 1 deletions
--- a/src/core/config/Categories.json
+++ b/src/core/config/Categories.json
@ -76,7 +76,8 @@
            "Rison Encode",
            "Rison Decode",
            "To Modhex",
-            "From Modhex"
+            "From Modhex",
+            "MIME Decoding"
        ]
    },
    {
--- a/src/core/operations/MIMEDecoding.mjs
+++ b/src/core/operations/MIMEDecoding.mjs
@ -0,0 +1,171 @@
+/**
+ * @author mshwed [m@ttshwed.com]
+ * @copyright Crown Copyright 2019
+ * @license Apache-2.0
+ */
+
+import Operation from "../Operation.mjs";
+import OperationError from "../errors/OperationError.mjs";
+import Utils from "../Utils.mjs";
+import { fromHex } from "../lib/Hex.mjs";
+import { fromBase64 } from "../lib/Base64.mjs";
+import cptable from "codepage";
+
+/**
+ * MIME Decoding operation
+ */
+class MIMEDecoding extends Operation {
+
+    /**
+     * MIMEDecoding constructor
+     */
+    constructor() {
+        super();
+
+        this.name = "MIME Decoding";
+        this.module = "Default";
+        this.description = "Enables the decoding of MIME message header extensions for non-ASCII text";
+        this.infoURL = "https://tools.ietf.org/html/rfc2047";
+        this.inputType = "byteArray";
+        this.outputType = "string";
+        this.args = [];
+    }
+
+    /**
+     * @param {byteArray} input
+     * @param {Object[]} args
+     * @returns {string}
+     */
+    run(input, args) {
+        const mimeEncodedText = Utils.byteArrayToUtf8(input);
+        const encodedHeaders = mimeEncodedText.replace(/\r\n/g, "\n");
+
+        const decodedHeader = this.decodeHeaders(encodedHeaders);
+
+        return decodedHeader;
+    }
+
+    /**
+     * Decode MIME header strings
+     *
+     * @param headerString
+     */
+    decodeHeaders(headerString) {
+        // No encoded words detected
+        let i = headerString.indexOf("=?");
+        if (i === -1) return headerString;
+
+        let decodedHeaders = headerString.slice(0, i);
+        let header = headerString.slice(i);
+
+        let isBetweenWords = false;
+        let start, cur, charset, encoding, j, end, text;
+        while (header.length > -1) {
+            start = header.indexOf("=?");
+            if (start === -1) break;
+            cur = start + "=?".length;
+
+            i = header.slice(cur).indexOf("?");
+            if (i === -1) break;
+
+            charset = header.slice(cur, cur + i);
+            cur += i + "?".length;
+
+            if (header.length < cur + "Q??=".length) break;
+
+            encoding = header[cur];
+            cur += 1;
+
+            if (header[cur] !== "?") break;
+
+            cur += 1;
+
+            j = header.slice(cur).indexOf("?=");
+            if (j === -1) break;
+
+            text = header.slice(cur, cur + j);
+            end = cur + j + "?=".length;
+
+            if (encoding.toLowerCase() === "b") {
+                text = fromBase64(text);
+            } else if (encoding.toLowerCase() === "q") {
+                text = this.parseQEncodedWord(text);
+            } else {
+                isBetweenWords = false;
+                decodedHeaders += header.slice(0, start + 2);
+                header = header.slice(start + 2);
+            }
+
+            if (start > 0 && (!isBetweenWords || header.slice(0, start).search(/\S/g) > -1)) {
+                decodedHeaders += header.slice(0, start);
+            }
+
+            decodedHeaders += this.convertFromCharset(charset, text);
+
+            header = header.slice(end);
+            isBetweenWords = true;
+        }
+
+        if (header.length > 0) {
+            decodedHeaders += header;
+        }
+
+        return decodedHeaders;
+    }
+
+    /**
+     * Converts decoded text for supported charsets.
+     * Supports UTF-8, US-ASCII, ISO-8859-*
+     *
+     * @param encodedWord
+     */
+    convertFromCharset(charset, encodedText) {
+        charset = charset.toLowerCase();
+        const parsedCharset = charset.split("-");
+
+        if (parsedCharset.length === 2 && parsedCharset[0] === "utf" && charset === "utf-8") {
+            return cptable.utils.decode(65001, encodedText);
+        } else if (parsedCharset.length === 2 && charset === "us-ascii") {
+            return cptable.utils.decode(20127, encodedText);
+        } else if (parsedCharset.length === 3 && parsedCharset[0] === "iso" && parsedCharset[1] === "8859") {
+            const isoCharset = parseInt(parsedCharset[2], 10);
+            if (isoCharset >= 1 && isoCharset <= 16) {
+                return cptable.utils.decode(28590 + isoCharset, encodedText);
+            }
+        }
+
+        throw new OperationError("Unhandled Charset");
+    }
+
+    /**
+     * Parses a Q encoded word
+     *
+     * @param encodedWord
+     */
+    parseQEncodedWord(encodedWord) {
+        let decodedWord = "";
+        for (let i = 0; i < encodedWord.length; i++) {
+            if (encodedWord[i] === "_") {
+                decodedWord += " ";
+            // Parse hex encoding
+            } else if (encodedWord[i] === "=") {
+                if ((i + 2) >= encodedWord.length) throw new OperationError("Incorrectly Encoded Word");
+                const decodedHex = Utils.byteArrayToChars(fromHex(encodedWord.substring(i + 1, i + 3)));
+                decodedWord += decodedHex;
+                i += 2;
+            } else if (
+                (encodedWord[i].charCodeAt(0) >= " ".charCodeAt(0) && encodedWord[i].charCodeAt(0) <= "~".charCodeAt(0)) ||
+                encodedWord[i] === "\n" ||
+                encodedWord[i] === "\r" ||
+                encodedWord[i] === "\t") {
+                decodedWord += encodedWord[i];
+            } else {
+                throw new OperationError("Incorrectly Encoded Word");
+            }
+        }
+
+        return decodedWord;
+    }
+}
+
+export default MIMEDecoding;
--- a/tests/operations/index.mjs
+++ b/tests/operations/index.mjs
@ -104,6 +104,7 @@ import "./tests/LZNT1Decompress.mjs";
 import "./tests/LZString.mjs";
 import "./tests/Magic.mjs";
 import "./tests/Media.mjs";
+import "./tests/MIMEDecoding.mjs";
 import "./tests/Modhex.mjs";
 import "./tests/MorseCode.mjs";
 import "./tests/MS.mjs";
--- a/tests/operations/tests/MIMEDecoding.mjs
+++ b/tests/operations/tests/MIMEDecoding.mjs
@ -0,0 +1,89 @@
+/**
+ * MIME Header Decoding tests
+ *
+ * @author mshwed [m@ttshwed.com]
+ * @copyright Crown Copyright 2019
+ * @license Apache-2.0
+ */
+
+import TestRegister from "../../lib/TestRegister.mjs";
+
+// Registers the "MIME Decoding" operation test cases. Each case runs the
+// operation with no arguments and compares the output with expectedOutput.
+TestRegister.addTests([
+    {
+        // A single Q encoded word wrapped in literal parentheses
+        name: "Encoded comments",
+        input: "(=?ISO-8859-1?Q?a?=)",
+        expectedOutput: "(a)",
+        recipeConfig: [
+            {
+                "op": "MIME Decoding",
+                "args": []
+            }
+        ]
+    },
+    {
+        // Whitespace between an encoded word and ordinary text is kept
+        name: "Encoded adjacent comments whitespace",
+        input: "(=?ISO-8859-1?Q?a?= b)",
+        expectedOutput: "(a b)",
+        recipeConfig: [
+            {
+                "op": "MIME Decoding",
+                "args": []
+            }
+        ]
+    },
+    {
+        // A single space separating two encoded words is dropped from the output
+        name: "Encoded adjacent single whitespace ignored",
+        input: "(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)",
+        expectedOutput: "(ab)",
+        recipeConfig: [
+            {
+                "op": "MIME Decoding",
+                "args": []
+            }
+        ]
+    },
+    {
+        // Two spaces separating two encoded words are dropped as well
+        name: "Encoded adjacent double whitespace ignored",
+        input: "(=?ISO-8859-1?Q?a?=  =?ISO-8859-1?Q?b?=)",
+        expectedOutput: "(ab)",
+        recipeConfig: [
+            {
+                "op": "MIME Decoding",
+                "args": []
+            }
+        ]
+    },
+    {
+        // A CRLF followed by a space between two encoded words is dropped
+        name: "Encoded adjacent CRLF whitespace ignored",
+        input: "(=?ISO-8859-1?Q?a?=\r\n =?ISO-8859-1?Q?b?=)",
+        expectedOutput: "(ab)",
+        recipeConfig: [
+            {
+                "op": "MIME Decoding",
+                "args": []
+            }
+        ]
+    },
+    {
+        // Lower-case charset and encoding labels, with plain text between the words preserved
+        name: "UTF-8 Encodings Multiple Headers",
+        input: "=?utf-8?q?=C3=89ric?= <eric@example.org>, =?utf-8?q?Ana=C3=AFs?= <anais@example.org>",
+        expectedOutput: "Éric <eric@example.org>, Anaïs <anais@example.org>",
+        recipeConfig: [
+            {
+                "op": "MIME Decoding",
+                "args": []
+            }
+        ]
+    },
+    {
+        // Mixes US-ASCII and ISO-8859-1/2 charsets with Q and B encodings; the two
+        // B encoded words split across a newline are joined in the expected output
+        name: "ISO Decoding",
+        input: "From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>\nTo: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>\nCC: =?ISO-8859-1?Q?Andr=E9?= Pirard <PIRARD@vm1.ulg.ac.be>\nSubject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=\n=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=",
+        expectedOutput: "From: Keith Moore <moore@cs.utk.edu>\nTo: Keld Jørn Simonsen <keld@dkuug.dk>\nCC: André Pirard <PIRARD@vm1.ulg.ac.be>\nSubject: If you can read this you understand the example.",
+        recipeConfig: [
+            {
+                "op": "MIME Decoding",
+                "args": []
+            }
+        ]
+    }
+]);