working on moving parsing to lib

2025-07-06 12:52:34 -04:00 · 2018-11-27 14:49:02 -05:00 · 2018-11-27 14:49:02 -05:00 · fa5d2b130f
commit fa5d2b130f
parent a23c94cd76
2 changed files with 316 additions and 23 deletions
--- a/src/core/lib/Mime.mjs
+++ b/src/core/lib/Mime.mjs
@ -0,0 +1,292 @@
+/**
+ * @author bwhitn [brian.m.whitney@outlook.com]
+ * @copyright Crown Copyright 2016
+ * @license Apache-2.0
+ */
+
+import OperationError from "../errors/OperationError";
+import cptable from "../vendor/js-codepage/cptable.js";
+import {fromBase64} from "../lib/Base64";
+import {decodeQuotedPrintable} from "../lib/QuotedPrintable";
+import {MIME_FORMAT} from "../lib/ChrEnc";
+import Utils from "../Utils";
+
+/**
+ * Maps the content type of a decoded section to the file extension used
+ * when the section has no file name of its own.
+ *
+ * @constant
+ * @default
+ */
+const BODY_FILE_TYPE = {
+    "text/plain": "txt",
+    "text/html": "htm",
+    "application/rtf": "rtf",
+};
+
+class Mime {
+    /**
+     * Internet Message Format constructor.
+     *
+     * Stores the raw message and records whether it contains a carriage
+     * return; that flag later selects a newline length of 2 or 1 when
+     * multipart sections are split.
+     *
+     * @param {string} input - Raw message text.
+     */
+    constructor(input) {
+        this.input = input;
+        // A null/undefined input must not throw here: decodeMime() checks for
+        // a falsy input itself and returns an empty result.
+        this.rn = input != null && input.indexOf("\r") >= 0;
+    }
+
+    /**
+     * Basic Email Parser that returns the header and mime sections as files.
+     *
+     * NOTE: Liberties taken include:
+     *  - No checks are made to verify quoted words are valid encodings,
+     *    e.g. underscore vs escape.
+     *  - Decoding is attempted regardless of whether the message uses \r\n
+     *    (correct newline) or \n (incorrect).
+     *  - Only Base64 and QuotedPrintable are decoded. UUEncode is a
+     *    standardized encoding format but is not available right now.
+     *
+     * @param {boolean} decodeWords - Decode encoded words in the raw header.
+     * @returns {File[]} The raw header followed by one file per section.
+     * @throws {OperationError} If no header/body split or no body is found.
+     */
+    decodeMime(decodeWords) {
+        // TODO Later: no uuencode function. See if we can fix this.
+        // TODO: content-type can be omitted and would mean us-ascii charset and text/plain.
+        if (!this.input) {
+            return [];
+        }
+        const emlObj = Mime._splitParse(this.input);
+        // _splitParse returns null when no blank line separates header and
+        // body; report that the same way as an empty body.
+        if (!emlObj || !emlObj.body) {
+            throw new OperationError("No body was found");
+        }
+        if (decodeWords) {
+            emlObj.rawHeader = Mime.replaceEncodedWord(emlObj.rawHeader);
+        }
+        const retval = [new File([emlObj.rawHeader], "Header", {type: "text/plain"})];
+        // Sections without a name of their own are named after the first
+        // subject header, falling back to "Undefined".
+        const baseName = "subject" in emlObj.header ? emlObj.header["subject"][0] : "Undefined";
+        this._walkMime(emlObj).forEach(function(fileObj) {
+            let name = fileObj.name;
+            if (name === null) {
+                // Own-property lookup so a content type such as "constructor"
+                // does not resolve to an inherited Object.prototype member.
+                const known = Object.prototype.hasOwnProperty.call(BODY_FILE_TYPE, fileObj.type);
+                name = baseName.concat(".", known ? BODY_FILE_TYPE[fileObj.type] : "bin");
+            }
+            retval.push(new File([fileObj.data], name, {type: fileObj.type}));
+        });
+        return retval;
+    }
+
+    /**
+     * Walks a parsed MIME section and returns its decoded leaf sections.
+     * A multipart section is split on its boundary and every part is parsed
+     * and walked recursively; any other section is returned as one entry.
+     *
+     * @param {object} parentObj - Section in the shape returned by _splitParse.
+     * @param {string} parentObj.body - Undecoded body of the section.
+     * @param {object} parentObj.header - Lower-cased header names mapped to value arrays.
+     * @returns {object[]} Entries of the form {type, data, name}.
+     * @throws {OperationError} If the section has no content-type header.
+     */
+    _walkMime(parentObj) {
+        const header = parentObj.header;
+        if (!header.hasOwnProperty("content-type")) {
+            throw new OperationError("Invalid Mime section");
+        }
+        const newLineLength = this.rn ? 2 : 1;
+        const contTypeObj = Mime._decodeComplexField(header["content-type"][0]);
+
+        // A file name from content-disposition wins over the content-type "name".
+        let fileName = null;
+        if (header.hasOwnProperty("content-disposition")) {
+            const contDispoObj = Mime._decodeComplexField(header["content-disposition"][0]);
+            if (contDispoObj != null && contDispoObj.hasOwnProperty("filename")) {
+                fileName = contDispoObj.filename;
+            }
+        }
+
+        let contType = null;
+        let charEnc = null;
+        if (contTypeObj != null) {
+            if (contTypeObj.hasOwnProperty("value")) {
+                contType = contTypeObj.value[0];
+            }
+            if (contTypeObj.hasOwnProperty("charset")) {
+                charEnc = contTypeObj.charset;
+            }
+            if (fileName == null && contTypeObj.hasOwnProperty("name")) {
+                fileName = contTypeObj.name;
+            }
+        }
+
+        // contType stays null when the header carries only parameters and no
+        // bare media type, so it must be checked before calling startsWith.
+        if (typeof contType === "string" && contType.startsWith("multipart/")) {
+            const contentBoundary = contTypeObj.hasOwnProperty("boundary") ? contTypeObj.boundary : null;
+            const outputSections = [];
+            const mimeParts = Mime._splitMultipart(parentObj.body, contentBoundary, newLineLength);
+            mimeParts.forEach(function(mimePart) {
+                const mimeObj = Mime._splitParse(mimePart);
+                // Skip parts with no header/body separator (null) or an empty body.
+                if (!mimeObj || !mimeObj.body) {
+                    return;
+                }
+                this._walkMime(mimeObj).forEach(function(part) {
+                    outputSections.push(part);
+                });
+            }, this);
+            return outputSections;
+        }
+
+        // The body is only decoded when a content-transfer-encoding header is present.
+        let data = parentObj.body;
+        if (header.hasOwnProperty("content-transfer-encoding")) {
+            const contEncObj = Mime._decodeComplexField(header["content-transfer-encoding"][0]);
+            if (contEncObj != null && contEncObj.hasOwnProperty("value") && contEncObj.value[0] != null) {
+                data = Mime._decodeMimeData(data, charEnc, contEncObj.value[0]);
+            }
+        }
+        return [{type: contType, data: data, name: fileName}];
+    }
+
+    /**
+     * Takes a string and decodes the encoded words inside it.
+     * These take the form of =?utf-8?Q?Hello?=
+     *
+     * The encoding marker is matched case-insensitively, so "Q"/"q" select
+     * quoted-printable and "B"/"b" select base64.
+     *
+     * @param {string} input
+     * @returns {string} The input with every encoded word replaced by its decoded text.
+     */
+    static replaceEncodedWord(input) {
+        return input.replace(/=\?([^?]+)\?([QqBb])\?([^?]+)\?=/g, function (match, charEnc, contEnc, encodedText) {
+            if (contEnc.toUpperCase() === "B") {
+                return Mime._decodeMimeData(encodedText, charEnc, "base64");
+            }
+            // In the Q encoding an underscore stands for a space.
+            return Mime._decodeMimeData(encodedText.replace(/_/g, " "), charEnc, "quoted-printable");
+        });
+    }
+
+
+    /**
+     * Splits the header from the body at the first blank line and parses the
+     * header. Returns an object containing the raw header, the body as found
+     * in the input and the parsed header object, or null when the input has
+     * no blank line.
+     *
+     * @param {string} input
+     * @returns {object|null} {rawHeader, body, header}, where header maps
+     *     lower-cased field names to arrays of values in order of appearance.
+     */
+    static _splitParse(input) {
+        const separator = /(?:\r?\n){2}/.exec(input);
+        if (!separator) {
+            return null;
+        }
+        const rawHeader = input.substring(0, separator.index);
+        const body = input.substring(separator.index + separator[0].length);
+        // Field names may contain digits (e.g. Content-MD5), so digits are
+        // accepted alongside letters and hyphens.
+        const sectionRegex = /([A-Za-z0-9-]+):\s+([\x00-\xff]+?)(?=$|\r?\n\S)/g;
+        const headerObj = {};
+        let section;
+        while ((section = sectionRegex.exec(rawHeader))) {
+            const fieldName = section[1].toLowerCase();
+            // Folded lines are joined by turning each line break character into a space.
+            const fieldValue = Mime.replaceEncodedWord(section[2].replace(/\n|\r/g, " "));
+            // Own-property check so a field called e.g. "constructor" is not
+            // mistaken for an existing entry inherited from Object.prototype.
+            if (Object.prototype.hasOwnProperty.call(headerObj, fieldName)) {
+                headerObj[fieldName].push(fieldValue);
+            } else {
+                headerObj[fieldName] = [fieldValue];
+            }
+        }
+        return {rawHeader: rawHeader, body: body, header: headerObj};
+    }
+
+    /**
+     * Reverses the content encoding of a piece of MIME data and then, when
+     * the character encoding is listed in MIME_FORMAT, decodes the result
+     * with cptable.
+     *
+     * @param {string} input - Encoded data.
+     * @param {string} charEnc - Character encoding name, looked up lower-cased.
+     * @param {string} contEnc - "base64" or "quoted-printable"; any other
+     *     value leaves the content encoding step out.
+     * @returns {string}
+     */
+    static _decodeMimeData(input, charEnc, contEnc) {
+        let decoded = input;
+        if (contEnc === "base64") {
+            decoded = fromBase64(input);
+        } else if (contEnc === "quoted-printable") {
+            decoded = Utils.byteArrayToUtf8(decodeQuotedPrintable(input));
+        }
+        if (!charEnc) {
+            return decoded;
+        }
+        const charsetKey = charEnc.toLowerCase();
+        if (!MIME_FORMAT.hasOwnProperty(charsetKey)) {
+            return decoded;
+        }
+        return cptable.utils.decode(MIME_FORMAT[charsetKey], decoded);
+    }
+
+    /**
+     * Parses a complex header field and returns an object that contains
+     * normalized (lower-cased) keys with corresponding values along with
+     * single values under a value array.
+     *
+     * Items are separated by ";" with optional whitespace after it. An item
+     * containing "=" becomes a key/value entry, with one pair of matching
+     * surrounding quotes removed from the value. Any other item is
+     * lower-cased and appended to the "value" array.
+     *
+     * @param {string} field
+     * @returns {object}
+     */
+    static _decodeComplexField(field) {
+        const retVal = {};
+        field.split(/;\s*/g).forEach(function(item) {
+            const eq = item.indexOf("=");
+            if (eq >= 0) {
+                const key = item.substring(0, eq).trim().toLowerCase();
+                let value = item.substring(eq + 1).trim();
+                const quote = value.charAt(0);
+                // Strip one pair of matching quotes. The length check keeps a
+                // lone quote character intact and lets "" become the empty string.
+                if (value.length >= 2 && (quote === "'" || quote === "\"") && value.endsWith(quote)) {
+                    value = value.slice(1, -1);
+                }
+                retVal[key] = value;
+                return;
+            }
+            const single = item.trim().toLowerCase();
+            if (retVal.hasOwnProperty("value")) {
+                retVal.value.push(single);
+            } else {
+                retVal.value = [single];
+            }
+        });
+        return retVal;
+    }
+
+    /**
+     * Splits a Mime document by the current boundaries and attempts to account
+     * for the current new line size which can be either the standard \r\n or \n.
+     *
+     * Each part runs from just after a "--boundary" delimiter line to just
+     * before the newline that precedes the next delimiter. The final part
+     * ends before the closing "--boundary--" delimiter, or at the end of the
+     * input when no closing delimiter exists. Empty parts are not returned.
+     *
+     * @param {string} input
+     * @param {string} boundary
+     * @param {number} new_line_length - 2 for \r\n line endings, anything else for \n.
+     * @return {string[]}
+     */
+    static _splitMultipart(input, boundary, new_line_length) {
+        const output = [];
+        const newline = new_line_length === 2 ? "\r\n" : "\n";
+        const delimiter = "--".concat(boundary, newline);
+        const closeDelimiter = "--".concat(boundary, "--");
+
+        // Prefer a closing delimiter that is followed by a newline and fall
+        // back to one that ends the input without it.
+        let closeIndex = input.indexOf(closeDelimiter.concat(newline));
+        if (closeIndex < 0) {
+            closeIndex = input.indexOf(closeDelimiter);
+        }
+        // Content stops before the newline that precedes the closing delimiter.
+        const last = closeIndex >= 0 ? closeIndex - new_line_length : input.length;
+
+        let delimiterIndex = input.indexOf(delimiter);
+        while (delimiterIndex >= 0) {
+            const partStart = delimiterIndex + delimiter.length;
+            const nextIndex = input.indexOf(delimiter, partStart);
+            const partEnd = nextIndex >= 0 ? nextIndex - new_line_length : last;
+            if (partEnd > partStart) {
+                output.push(input.substring(partStart, partEnd));
+            }
+            // -1 once no further delimiter exists, which ends the loop.
+            delimiterIndex = nextIndex;
+        }
+        return output;
+    }
+}
+
+export default Mime;
--- a/src/core/operations/ParseIMF.mjs
+++ b/src/core/operations/ParseIMF.mjs
@ -10,21 +10,20 @@ import cptable from "../vendor/js-codepage/cptable.js";
 import {fromBase64} from "../lib/Base64";
 import {decodeQuotedPrintable} from "../lib/QuotedPrintable";
 import {MIME_FORMAT} from "../lib/ChrEnc";
+import Mime from "../lib/Mime";
 import Utils from "../Utils";

 /**
- * Return the conetent encoding for a mime section from a header object.
- * CONTENT_TYPE returns the content type of a mime header from a header object.
- * Returns the filename from a mime header object.
- * Returns the boundary value for the mime section from a header object.
+ *
+ *
 * @constant
 * @default
- */
+ *
 const BODY_FILE_TYPE = {
    "text/plain": "txt",
    "text/html": "htm",
    "application/rtf": "rtf",
-}
+} */

 class ParseIMF extends Operation {

@ -52,6 +51,11 @@ class ParseIMF extends Operation {
        ];
    }

+    /**
+     * Wraps the input in a Mime instance and returns the result of its
+     * decodeMime method.
+     *
+     * @param {string} input
+     * @param {Object[]} args - args[0] is passed to decodeMime as the decode words flag.
+     * @returns {File[]}
+     */
+    run(input, args) {
+        return new Mime(input).decodeMime(args[0]);
+    }
+
    /**
     * Basic Email Parser that displays the header and mime sections as files.
     * Args 0 boolean decode quoted words
@ -59,7 +63,7 @@ class ParseIMF extends Operation {
     * @param {string} input
     * @param {Object[]} args
     * @returns {File[]}
-     */
+     *
     // NOTE: Liberties taken include:
     // No checks are made to verify quoted words are valid encodings e.g. underscore vs escape
     // This attempts to decode mime reguardless if it is \r\n (correct newline) or \n (incorrect)
@ -99,7 +103,7 @@ class ParseIMF extends Operation {
            retval.push(file);
        });
        return retval;
-    }
+    } */

    /**
     * Displays the files in HTML for web apps.
@ -117,7 +121,7 @@ class ParseIMF extends Operation {
     * @param {string} input
     * @param {object} header
     * @returns {object[]}
-     */
+     *
    static walkMime(parentObj, rn) {
        let new_line_length = rn ? 2 : 1;
        let contType = null, fileName = null, charEnc = null, contDispoObj = null;
@ -180,7 +184,7 @@ class ParseIMF extends Operation {
     *
     * @param {string} input
     * @returns {string}
-     */
+     *
    static replaceDecodeWord(input) {
        return input.replace(/=\?([^?]+)\?(Q|B)\?([^?]+)\?=/g, function (a, charEnc, contEnc, input) {
            contEnc = (contEnc === "B") ? "base64" : "quoted-printable";
@ -199,7 +203,7 @@ class ParseIMF extends Operation {
     *
     * @param {string} input
     * @returns {object}
-     */
+     *
    static splitParse(input) {
        const emlRegex = /(?:\r?\n){2}/g;
        let matchobj = emlRegex.exec(input);
@ -228,7 +232,7 @@ class ParseIMF extends Operation {
     * @param {string} charEnc
     * @param {string} contEnc
     * @returns {string}
-     */
+     *
    static decodeMimeData(input, charEnc, contEnc) {
        switch (contEnc) {
            case "base64":
@ -237,10 +241,6 @@ class ParseIMF extends Operation {
            case "quoted-printable":
                input = Utils.byteArrayToUtf8(decodeQuotedPrintable(input));
                break;
-            case "7bit":
-            case "8bit":
-            default:
-                break;
        }
        if (charEnc && MIME_FORMAT.hasOwnProperty(charEnc.toLowerCase())) {
            input = cptable.utils.decode(MIME_FORMAT[charEnc.toLowerCase()], input);
@ -249,12 +249,13 @@ class ParseIMF extends Operation {
    }

    /**
-     * Parse a complex header field and return an object that contains normalized
-     * keys with corresponding values and single values under a value array.
+     * Parses a complex header field and returns an object that contains
+     * normalized keys with corresponding values along with single values under
+     * a value array.
     *
     * @param {string} field
     * @returns {object}
-     */
+     *
    static decodeComplexField(field) {
        let fieldSplit = field.split(/;\s+/g);
        let retVal = {};
@ -285,14 +286,14 @@ class ParseIMF extends Operation {
    }

    /**
-     * Splits a Mime document by the current boundaries and try to account for
-     * the current new line size which can be either the standard \r\n or \n.
+     * Splits a Mime document by the current boundaries and attempts to account
+     * for the current new line size which can be either the standard \r\n or \n.
     *
     * @param {string} input
     * @param {string} boundary
     * @param {string} new_line_length
     * @return {string[]}
-     */
+     *
    static splitMultipart(input, boundary, new_line_length) {
        let output = [];
        let newline = new_line_length === 2 ? "\r\n" : "\n";
@ -319,7 +320,7 @@ class ParseIMF extends Operation {
            start = end;
        }
        return output;
-    }
+    } */
 }

 export default ParseIMF