/**
 * Preferred MIME encoding format mappings.
 *
 * Maps a lower-cased MIME charset label (as found in a Content-Type
 * "charset" parameter or in an encoded word) to the numeric code page
 * identifier used for decoding. Look-ups are expected to lower-case the
 * label first, so every key here must be lower case.
 *
 * "iso-2022" and "x-euc" are kept for backward compatibility; the labels
 * that actually occur in mail ("iso-2022-jp", "euc-jp") are provided as
 * aliases of the same code pages, and "utf-16le" is an alias of "unicode".
 */
export const MIME_FORMAT = {
    "utf-8": 65001,
    "utf-7": 65000,
    "unicode": 1200,
    "utf-16le": 1200,
    "ibm500": 500,
    "ebcdic-cp-us": 37,
    "windows-874": 874,
    "shift_jis": 932,
    "gbk": 936,
    "gb2312": 936,
    "ks_c_5601-1987": 949,
    "big5": 950,
    "windows-1250": 1250,
    "windows-1251": 1251,
    "windows-1252": 1252,
    "windows-1253": 1253,
    "windows-1254": 1254,
    "windows-1255": 1255,
    "windows-1256": 1256,
    "windows-1257": 1257,
    "windows-1258": 1258,
    "us-ascii": 20127,
    "koi8-r": 20866,
    "koi8-u": 21866,
    "iso-8859-1": 28591,
    "iso-8859-2": 28592,
    "iso-8859-3": 28593,
    "iso-8859-4": 28594,
    "iso-8859-5": 28595,
    "iso-8859-6": 28596,
    "iso-8859-7": 28597,
    "iso-8859-8": 28598,
    "iso-8859-9": 28599,
    "iso-8859-10": 28600,
    "iso-8859-11": 28601,
    "iso-8859-13": 28603,
    "iso-8859-14": 28604,
    "iso-8859-15": 28605,
    "iso-8859-16": 28606,
    "iso-2022": 50222,
    "iso-2022-jp": 50222,
    "x-euc": 51932,
    "euc-jp": 51932,
    "euc-kr": 51949,
    "gb18030": 54936,
};
"../Utils"; + +/** + * Class to do general Mime format parsing + * + * @author bwhitn [brian.m.whitney@outlook.com] + * @copyright Crown Copyright 2016 + * @license Apache-2.0 + */ +class Mime { + /** + * Mime Constructor + */ + constructor(input) { + this.mimeObj = Mime._parseMime(input); + } + + /** + * Extract data from mimeObjects and return object array containing them. + * extractData([["testa", "header", "subheader"], ["testb", "header"]]) would + * returns an array of objects {fields: {testa: "somestringornull", testb: "somestringornull"}, header: "somestringornull", body: "somestringornull"} + * + * @param {string[][]} headerObjects + * @param {boolean} header + * @param {boolean} body + * @param {boolean} recursive + * @returns {object[]} + */ + extractData(headerObjects, header=true, body=true, recursive=true) { + const output = []; + Mime.walkMime(this.mimeObj, function(mimePart) { + const outObj = {}; + outObj.fields = {}; + if (body) { + const contType = Mime._extractField(mimePart, "content-type"); + if (contType && !contType.startsWith("multipart/")) { + outObj.body = mimePart.body; + } else { + outObj.body = null; + } + } + if (header) { + outObj.header = mimePart.rawHeader; + } + if (!headerObjects) { + output.push(outObj); + return; + } + if (!Array.isArray(headerObjects)) { + throw new OperationError("Invalid extraction in headers. Not an Array."); + } + headerObjects.forEach(function(obj) { + if (!Array.isArray(obj)) { + throw new OperationError("Invalid extraction in headers Object. Not an Array."); + } + switch (obj.length) { + case 2: + outObj.fields[obj[0]] = Mime._extractField(mimePart, obj[1]); + break; + case 3: + outObj.fields[obj[0]] = Mime._extractField(mimePart, obj[1], obj[2]); + break; + default: + throw new OperationError("Invalid extraction in headers. Invalid Array size."); + } + }); + output.push(outObj); + }, recursive); + return output; + } + + /** + * Common helper function to decode Mime encoded words in headers. 
+ * + * @param {boolean} recursive + */ + decodeHeaderWords(recursive=true) { + Mime.walkMime(this.mimeObj, function(mimePart) { + if (mimePart.rawHeader) { + mimePart.rawHeader = Mime.replaceEncodedWord(mimePart.rawHeader); + } + }, recursive); + } + + /** + * Common helper function to decode Mime bodies. + * + * @param {boolean} recursive + */ + decodeMimeObjects(recursive=true) { + Mime.walkMime(this.mimeObj, function(mimePart) { + Mime.decodeMimeMessage(mimePart); + }, recursive); + } + + /** + * Walks a MIME document and returns a Mime Object. + * + * @param {string} mimeData + * @returns {object} + */ + static _parseMime(mimeData) { + const mimeObj = Mime._splitParseHead(mimeData); + const contType = Mime._extractField(mimeObj, "content-type"); + const boundary = Mime._extractField(mimeObj, "content-type", "boundary"); + if (mimeObj.body && contType && contType.startsWith("multipart/")) { + if (!boundary) { + throw new OperationError("Invalid mulitpart section no boundary"); + } + const sections = []; + for (const val of Mime._splitMultipart(mimeObj.body, boundary)) { + sections.push(Mime._parseMime(val)); + } + if (sections.length) { + mimeObj.body = sections; + } + } + return mimeObj; + } + + /** + * Executes a function on a mime object. These methods should modify the mimeObj. + * + * @param {Object} mimeObj + * @param {function} methods + * @param {boolean} recursive + */ + static walkMime(mimeObj, method, recursive=true) { + const contType = Mime._extractField(mimeObj, "content-type"); + method(mimeObj); + if (recursive && mimeObj.body && Array.isArray(mimeObj.body) && contType && contType.startsWith("multipart/")) { + mimeObj.body.forEach(function(obj) { + Mime.walkMime(obj, method); + }); + } + } + + /** + * Attempts to decode a mimeObj's data by applying appropriate character and content decoders based on the header data. 
+ * + * @param {Object} mimeObj + */ + static decodeMimeMessage(mimeObj) { + const contType = Mime._extractField(mimeObj, "content-type"); + const contEnc = Mime._extractField(mimeObj, "content-transfer-encoding"); + let charEnc = Mime._extractField(mimeObj, "content-type", "charset"); + if (contType != null) { + if (!charEnc && contType.startsWith("text/")) { + charEnc = "us-ascii"; + } + } + if (mimeObj.body && contEnc && typeof mimeObj.body === "string") { + mimeObj.body = Mime._decodeMimeData(mimeObj.body, charEnc, contEnc); + } + } + + /** + * Takes a string and decodes quoted words inside them + * These take the form of: + * input "=?utf-8?Q?Hello_World!?=" + * output "Hello World!" + * + * @param {string} input + * @param {string} type + * @returns {string} + */ + static replaceEncodedWord(input) { + return input.replace(/=\?([^?]+)\?(Q|B)\?([^?]+)\?=/g, function (a, charEnc, contEnc, input) { + contEnc = (contEnc === "B") ? "base64" : "quoted-printable"; + if (contEnc === "quoted-printable") { + input = input.replace(/_/g, " "); + } + return Utils.byteArrayToUtf8(Mime._decodeMimeData(input, charEnc, contEnc)); + }); + } + + /** + * Breaks the header from the body and parses the header. The returns an + * object or null. The object contains the raw header, decoded body, and + * parsed header object. 
+ * + * @param {string} input + * @returns {object} + */ + static _splitParseHead(input) { + const emlRegex = /(?:\r?\n){2}/g; + const matchObj = emlRegex.exec(input); + if (matchObj) { + const splitEmail = [input.substring(0, matchObj.index), input.substring(emlRegex.lastIndex)]; + return {rawHeader: splitEmail[0], body: splitEmail[1], header: Mime._parseHeader(splitEmail[0])}; + } + return {rawHeader: input, body: null, header: Mime._parseHeader(input)}; + } + + /** + * + * + * + */ + static _parseHeader(input) { + const sectionRegex = /([A-Za-z-]+):\s+([\x00-\xff]+?)(?=$|\r?\n\S)/g; + const headerObj = {}; + let section; + while ((section = sectionRegex.exec(input))) { + const fieldName = section[1].toLowerCase(); + const fieldValue = Mime.replaceEncodedWord(section[2].replace(/\n|\r/g, " ")); + if (fieldName in headerObj) { + headerObj[fieldName].push(fieldValue); + } else { + headerObj[fieldName] = [fieldValue]; + } + } + return headerObj; + } + + /** + * Return decoded MIME data given the character encoding and content encoding. + * + * @param {string} input + * @param {string} charEnc + * @param {string} contEnc + * @returns {string} + */ + static _decodeMimeData(input, charEnc, contEnc) { + try { + switch (contEnc) { + case "base64": + input = Utils.convertToByteArray(input, "base64"); + break; + case "quoted-printable": + input = decodeQuotedPrintable(input); + } + if (charEnc && MIME_FORMAT.hasOwnProperty(charEnc.toLowerCase())) { + input = Utils.strToByteArray(cptable.utils.decode(MIME_FORMAT[charEnc.toLowerCase()], input)); + } + return input; + } catch (err) { + throw new OperationError("Invalid Mime Format"); + } + } + + /** + * Parses a header field and returns an object that contains + * normalized keys with corresponding values along with single values under + * a value array. 
+ * + * @param {string} field + * @returns {string} + */ + static _extractField(mimeObj, field, subfield=null) { + if (subfield) { + subfield = subfield.toLowerCase(); + } + if (mimeObj.header.hasOwnProperty(field)) { + const fieldSplit = mimeObj.header[field][0].split(/;\s+/g); + for (let i = 0; i < fieldSplit.length; i++) { + const eq = fieldSplit[i].indexOf("="); + if (eq >= 0 && fieldSplit[i].length > eq && subfield) { + const kv = [fieldSplit[i].substring(0, eq), fieldSplit[i].substring(eq + 1).trim()]; + if ((kv[1].startsWith("'") && kv[1].endsWith("'")) || (kv[1].startsWith("\"") && kv[1].endsWith("\""))) { + const val = (/(['"])(.+)\1/.exec(kv[1])); + if (val && val.length === 3) { + kv[1] = val[2]; + } + } + if (subfield === kv[0].toLowerCase()) { + return kv[1]; + } + } else if (!subfield){ + return fieldSplit[i].trim().toLowerCase(); + } + } + } + return null; + } + + /** + * Splits a Mime document by the current boundaries and attempts to account + * for the current new line size which can be either the standard \r\n or \n. + * + * @param {string} input + * @param {string} boundary + * @return {string[]} + */ + static *_splitMultipart(input, boundary) { + const newline = input.indexOf("\r") >= 0 ? 
"\r\n" : "\n"; + const boundaryStr = "--".concat(boundary); + const boundaryStrEnd = newline.concat(boundaryStr); + const last = input.indexOf(boundaryStrEnd.concat("--")); + let begin = 0; + for (let end = 0; end !== last; begin = end) { + begin = input.indexOf(boundaryStr, begin); + if (begin < 0) { + break; + } + begin += boundaryStr.length; + end = input.indexOf(boundaryStrEnd, begin); + if (end <= begin) { + break; + } + yield input.substring(begin, end); + } + } +} + +export default Mime; diff --git a/src/core/lib/QuotedPrintable.mjs b/src/core/lib/QuotedPrintable.mjs new file mode 100644 index 0000000000..e7f7ece1ba --- /dev/null +++ b/src/core/lib/QuotedPrintable.mjs @@ -0,0 +1,35 @@ +/** + * Some parts taken from mimelib (http://github.com/andris9/mimelib) + * @author Andris Reinman + * @license MIT + * + * @author n1474335 [n1474335@gmail.com] + * @copyright Crown Copyright 2016 + * @license Apache-2.0 + */ + +/** + * @param {string} input + * @returns {byteArray} + */ +export function decodeQuotedPrintable(input) { + const str = input.replace(/=(?:\r?\n|$)/g, ""); + + const encodedBytesCount = (str.match(/=[\da-fA-F]{2}/g) || []).length, + bufferLength = str.length - encodedBytesCount * 2, + buffer = new Array(bufferLength); + let chr, hex, + bufferPos = 0; + + for (let i = 0, len = str.length; i < len; i++) { + chr = str.charAt(i); + if (chr === "=" && (hex = str.substr(i + 1, 2)) && /[\da-fA-F]{2}/.test(hex)) { + buffer[bufferPos++] = parseInt(hex, 16); + i += 2; + continue; + } + buffer[bufferPos++] = chr.charCodeAt(0); + } + + return buffer; +} diff --git a/src/core/operations/DecodeMimeEncodedWords.mjs b/src/core/operations/DecodeMimeEncodedWords.mjs new file mode 100644 index 0000000000..9997d74043 --- /dev/null +++ b/src/core/operations/DecodeMimeEncodedWords.mjs @@ -0,0 +1,43 @@ +/** + * @author bwhitn [brian.m.whitney@outlook.com] + * @copyright Crown Copyright 2016 + * @license Apache-2.0 + */ + +import Operation from "../Operation"; +import 
/**
 * Operation for finding and replacing Mime encoded words.
 */
class DecodeMimeEncodedWords extends Operation {

    /**
     * DecodeMimeEncodedWords constructor
     */
    constructor() {
        super();
        this.name = "Decode Mime Encoded Words";
        this.module = "Default";
        this.description = ["Decodes Mime encoded words that are found in Internet Message Format (IMF, RFC5322) messages.",
            "<br><br>",
            "Encoded words take the form =?charset?encoding?encoded-text?= as defined in RFC2047.",
        ].join("\n");
        this.infoURL = "https://tools.ietf.org/html/rfc2047";
        this.inputType = "string";
        this.outputType = "string";
        this.args = [];
    }

    /**
     * Replaces every Mime encoded word in the input with its decoded text;
     * all other text is returned unchanged.
     *
     * @param {string} input
     * @param {Object[]} args - unused, the operation takes no arguments
     * @returns {string}
     */
    run(input, args) {
        return Mime.replaceEncodedWord(input);
    }
}

export default DecodeMimeEncodedWords;
/**
 * Parse Mime operation: splits a Mime / Internet Message Format message
 * into its root header and one file per non-empty leaf part.
 */
class ParseMime extends Operation {

    /**
     * ParseMime constructor
     */
    constructor() {
        super();
        this.name = "Parse Mime";
        this.module = "Default";
        this.description = ["Generic Mime Message parser that decodes Mime messages into files",
            "<br><br>",
            "The output will be the root header and the associated mime parts.",
            "This includes Internet Message Format which are found in SMTP traffic."
        ].join("\n");
        this.infoURL = "https://tools.ietf.org/html/rfc2045";
        this.inputType = "string";
        // run() returns File objects, so the element type must be declared.
        this.outputType = "List<File>";
        this.presentType = "html";
        this.args = [
            {
                "name": "Decode Encoded-Words",
                "type": "boolean",
                "value": false
            }
        ];
    }

    /**
     * Basic Email Parser that returns the header and mime sections as files.
     *
     * The first file is the root header ("Header.txt") when the message has
     * one. Every part with a non-empty body follows, named after its
     * Content-Disposition filename, else its Content-Type name, else the
     * message subject (or "Undefined") plus an extension chosen from its
     * content type.
     *
     * @param {string} input
     * @param {Object[]} args - args[0]: decode encoded words in the root header
     * @returns {File[]}
     */
    run(input, args) {
        const eml = new Mime(input);
        if (!eml.mimeObj) {
            return [];
        }
        eml.decodeMimeObjects();
        if (args[0]) {
            // Only the root header is output, so there is no need to recurse.
            eml.decodeHeaderWords(false);
        }
        const fields = [["filename", "content-disposition", "filename"],
            ["name", "content-type", "name"],
            ["type", "content-type"],
            ["subject", "subject"]];
        const dataObj = eml.extractData(fields);
        const retval = [];
        let subject = null;
        if (dataObj.length >= 1) {
            subject = dataObj[0].fields.subject;
            if (dataObj[0].header) {
                retval.push(new File([dataObj[0].header], "Header.txt", {type: "text/plain"}));
            }
        }
        dataObj.forEach(function(obj) {
            if (!obj.body) {
                return;
            }
            const type = obj.fields.type ? obj.fields.type : "text/plain";
            let name = obj.fields.filename ? obj.fields.filename : obj.fields.name;
            if (!name) {
                name = (subject ? subject : "Undefined").concat(ParseMime.getFileExt(type));
            }
            // Decoded bodies arrive as arrays of byte values, undecoded ones as strings.
            const content = Array.isArray(obj.body) ? Uint8Array.from(obj.body) : obj.body;
            retval.push(new File([content], name, {type: type}));
        });
        return retval;
    }

    /**
     * Simple function to add a common file extension based on mime type string.
     * Unknown types get ".bin".
     *
     * @param {string} mimetype
     * @returns {string}
     */
    static getFileExt(mimetype) {
        switch (mimetype) {
            case "text/plain":
                return ".txt";
            case "text/html":
                return ".htm";
            case "application/rtf":
                return ".rtf";
        }
        return ".bin";
    }

    /**
     * Displays the files in HTML for web apps.
     *
     * @param {File[]} files
     * @returns {html}
     */
    async present(files) {
        return await Utils.displayFilesAsHTML(files);
    }
}

export default ParseMime;