diff --git a/package-lock.json b/package-lock.json
index 50e7fb46..69463316 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -2165,7 +2165,6 @@
"version": "0.18.1",
"resolved": "https://registry.npmjs.org/axios/-/axios-0.18.1.tgz",
"integrity": "sha512-0BfJq4NSfQXd+SkFdrvFbG7addhYSBA2mQwISr46pD6E5iqkWg02RAs8vyTT/j0RTnoYmeXauBuSv1qKwR179g==",
- "dev": true,
"requires": {
"follow-redirects": "1.5.10",
"is-buffer": "^2.0.2"
@@ -5712,7 +5711,6 @@
"version": "1.5.10",
"resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.5.10.tgz",
"integrity": "sha512-0V5l4Cizzvqt5D44aTXbFZz+FtyXV1vrDN6qrelxtfYQKW0KO0W2T/hkE8xvGa/540LkZlkaUjO4ailYTFtHVQ==",
- "dev": true,
"requires": {
"debug": "=3.1.0"
},
@@ -5721,7 +5719,6 @@
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz",
"integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==",
- "dev": true,
"requires": {
"ms": "2.0.0"
}
@@ -7464,6 +7461,11 @@
"postcss": "^7.0.14"
}
},
+ "idb-keyval": {
+ "version": "3.2.0",
+ "resolved": "https://registry.npmjs.org/idb-keyval/-/idb-keyval-3.2.0.tgz",
+ "integrity": "sha512-slx8Q6oywCCSfKgPgL0sEsXtPVnSbTLWpyiDcu6msHOyKOLari1TD1qocXVCft80umnkk3/Qqh3lwoFt8T/BPQ=="
+ },
"ieee754": {
"version": "1.1.12",
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.1.12.tgz",
@@ -7756,8 +7758,7 @@
"is-buffer": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.3.tgz",
- "integrity": "sha512-U15Q7MXTuZlrbymiz95PJpZxu8IlipAp4dtS3wOdgPXx3mqBnslrWU14kxfHB+Py/+2PVKSr37dMAgM2A4uArw==",
- "dev": true
+ "integrity": "sha512-U15Q7MXTuZlrbymiz95PJpZxu8IlipAp4dtS3wOdgPXx3mqBnslrWU14kxfHB+Py/+2PVKSr37dMAgM2A4uArw=="
},
"is-builtin-module": {
"version": "1.0.0",
@@ -7981,8 +7982,7 @@
"is-url": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz",
- "integrity": "sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==",
- "dev": true
+ "integrity": "sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww=="
},
"is-utf8": {
"version": "0.2.1",
@@ -9284,6 +9284,11 @@
"lower-case": "^1.1.1"
}
},
+ "node-fetch": {
+ "version": "2.6.0",
+ "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.0.tgz",
+ "integrity": "sha512-8dG4H5ujfvFiqDmVu9fQ5bOHUC15JMjMY/Zumv26oOvvVJjM67KF8koCWIabKQ1GJIa9r2mMZscBq/TbdOcmNA=="
+ },
"node-forge": {
"version": "0.8.5",
"resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.8.5.tgz",
@@ -9719,6 +9724,11 @@
"mimic-fn": "^1.0.0"
}
},
+ "opencollective-postinstall": {
+ "version": "2.0.2",
+ "resolved": "https://registry.npmjs.org/opencollective-postinstall/-/opencollective-postinstall-2.0.2.tgz",
+ "integrity": "sha512-pVOEP16TrAO2/fjej1IdOyupJY8KDUM1CvsaScRbw6oddvpQoOfGk4ywha0HKKVAD6RkW4x6Q+tNBwhf3Bgpuw=="
+ },
"opener": {
"version": "1.5.1",
"resolved": "https://registry.npmjs.org/opener/-/opener-1.5.1.tgz",
@@ -11247,8 +11257,7 @@
"resolve-url": {
"version": "0.2.1",
"resolved": "https://registry.npmjs.org/resolve-url/-/resolve-url-0.2.1.tgz",
- "integrity": "sha1-LGN/53yJOv0qZj/iGqkIAGjiBSo=",
- "dev": true
+ "integrity": "sha1-LGN/53yJOv0qZj/iGqkIAGjiBSo="
},
"restore-cursor": {
"version": "2.0.0",
@@ -12647,6 +12656,58 @@
}
}
},
+ "tesseract.js": {
+ "version": "2.0.0-alpha.15",
+ "resolved": "https://registry.npmjs.org/tesseract.js/-/tesseract.js-2.0.0-alpha.15.tgz",
+ "integrity": "sha512-qM1XUFVlTO+tx6oVRpd9QQ8PwQLxo3qhbfIHByUlUVIqWx6y/U9xlHIaG033/Tjfs2EQ0NAehPTOJ+eNElsXEg==",
+ "requires": {
+ "axios": "^0.18.0",
+ "check-types": "^7.4.0",
+ "is-url": "1.2.2",
+ "node-fetch": "^2.3.0",
+ "opencollective-postinstall": "^2.0.2",
+ "resolve-url": "^0.2.1",
+ "tesseract.js-core": "^2.0.0-beta.11",
+ "tesseract.js-utils": "^1.0.0-beta.8"
+ },
+ "dependencies": {
+ "check-types": {
+ "version": "7.4.0",
+ "resolved": "https://registry.npmjs.org/check-types/-/check-types-7.4.0.tgz",
+ "integrity": "sha512-YbulWHdfP99UfZ73NcUDlNJhEIDgm9Doq9GhpyXbF+7Aegi3CVV7qqMCKTTqJxlvEvnQBp9IA+dxsGN6xK/nSg=="
+ },
+ "is-url": {
+ "version": "1.2.2",
+ "resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.2.tgz",
+ "integrity": "sha1-SYkFpZO/R8wtnn9zg3K792lsfyY="
+ }
+ }
+ },
+ "tesseract.js-core": {
+ "version": "2.0.0-beta.11",
+ "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-2.0.0-beta.11.tgz",
+ "integrity": "sha512-07haKH2JYYo0OfIJoioMS9dDiI5Hrl7+r1MqjeNAAT5WpKO0ATe4cpncC8s1kz0e3s1kaC5WOwL3YJcjbJE+hg=="
+ },
+ "tesseract.js-utils": {
+ "version": "1.0.0-beta.8",
+ "resolved": "https://registry.npmjs.org/tesseract.js-utils/-/tesseract.js-utils-1.0.0-beta.8.tgz",
+ "integrity": "sha512-qjHBfWfzo2o1ZY9XI0Wh2hmpp38+mIgCMOk60W5Yyie/pBl421VLBKOZUEwQgpbLnOJ24VU6Q8yXsVgtFFHcFg==",
+ "requires": {
+ "axios": "^0.18.0",
+ "bmp-js": "^0.1.0",
+ "file-type": "^10.5.0",
+ "idb-keyval": "^3.1.0",
+ "is-url": "^1.2.4",
+ "zlibjs": "^0.3.1"
+ },
+ "dependencies": {
+ "file-type": {
+ "version": "10.11.0",
+ "resolved": "https://registry.npmjs.org/file-type/-/file-type-10.11.0.tgz",
+ "integrity": "sha512-uzk64HRpUZyTGZtVuvrjP0FYxzQrBf4rojot6J65YMEbwBLB0CWm0CLojVpwpmFmxcE/lkvYICgfcGozbBq6rw=="
+ }
+ }
+ },
"text-table": {
"version": "0.2.0",
"resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz",
diff --git a/package.json b/package.json
index 0b8c014b..ff40c317 100644
--- a/package.json
+++ b/package.json
@@ -143,6 +143,7 @@
"sortablejs": "^1.9.0",
"split.js": "^1.5.11",
"ssdeep.js": "0.0.2",
+ "tesseract.js": "^2.0.0-alpha.15",
"ua-parser-js": "^0.7.20",
"utf8": "^3.0.0",
"vkbeautify": "^0.99.3",
diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json
index 49ef774c..94f7fd30 100755
--- a/src/core/config/Categories.json
+++ b/src/core/config/Categories.json
@@ -383,6 +383,7 @@
"ops": [
"Render Image",
"Play Media",
+ "Optical Character Recognition",
"Remove EXIF",
"Extract EXIF",
"Split Colour Channels",
diff --git a/src/core/operations/OpticalCharacterRecognition.mjs b/src/core/operations/OpticalCharacterRecognition.mjs
new file mode 100644
index 00000000..ccfbcd1f
--- /dev/null
+++ b/src/core/operations/OpticalCharacterRecognition.mjs
@@ -0,0 +1,77 @@
+/**
+ * @author n1474335 [n1474335@gmail.com]
+ * @author mshwed [m@ttshwed.com]
+ * @copyright Crown Copyright 2019
+ * @license Apache-2.0
+ */
+
+import Operation from "../Operation.mjs";
+import OperationError from "../errors/OperationError.mjs";
+import { isImage } from "../lib/FileType.mjs";
+import { toBase64 } from "../lib/Base64.mjs";
+import { isWorkerEnvironment } from "../Utils.mjs";
+
+import Tesseract from "tesseract.js";
+const { TesseractWorker } = Tesseract;
+
+/**
+ * Optical Character Recognition operation
+ */
+class OpticalCharacterRecognition extends Operation {
+
+ /**
+ * OpticalCharacterRecognition constructor
+ */
+ constructor() {
+ super();
+
+ this.name = "Optical Character Recognition";
+ this.module = "Image";
+ this.description = "Optical character recognition or optical character reader (OCR) is the mechanical or electronic conversion of images of typed, handwritten or printed text into machine-encoded text.
Supported image formats: png, jpg, bmp, pbm.";
+ this.infoURL = "https://wikipedia.org/wiki/Optical_character_recognition";
+ this.inputType = "ArrayBuffer";
+ this.outputType = "string";
+ this.args = [
+ {
+ name: "Show confidence",
+ type: "boolean",
+ value: true
+ }
+ ];
+ }
+
+ /**
+ * @param {ArrayBuffer} input
+ * @param {Object[]} args
+ * @returns {string}
+ */
+ async run(input, args) {
+ const [showConfidence] = args;
+
+ const type = isImage(input);
+ if (!type) {
+ throw new OperationError("Invalid File Type");
+ }
+
+ try {
+ const image = `data:${type};base64,${toBase64(input)}`;
+ const worker = new TesseractWorker();
+ const result = await worker.recognize(image)
+ .progress(progress => {
+ if (isWorkerEnvironment()) {
+ self.sendStatusMessage(`Status: ${progress.status} - ${(parseFloat(progress.progress)*100).toFixed(2)}%`);
+ }
+ });
+
+ if (showConfidence) {
+ return `Confidence: ${result.confidence}%\n\n${result.text}`;
+ } else {
+ return result.text;
+ }
+ } catch (err) {
+ throw new OperationError(`Error performing OCR on image. (${err})`);
+ }
+ }
+}
+
+export default OpticalCharacterRecognition;
diff --git a/tests/operations/tests/Image.mjs b/tests/operations/tests/Image.mjs
index ea5c64a4..307420f2 100644
--- a/tests/operations/tests/Image.mjs
+++ b/tests/operations/tests/Image.mjs
@@ -247,5 +247,20 @@ TestRegister.addTests([
args: ["None"]
}
]
- }
+ },
+ /*{ Commented out as it takes a while to run and drops a file to disk (eng.traineddata)
+ name: "Optical Character Recognition",
+ input: "iVBORw0KGgoAAAANSUhEUgAAAUAAAAC0CAIAAABqhmJGAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAASuSURBVHhe7dftVdswAIbRzsVAzMM0XabDUCOUxLYsWW4Jp+/pvf9w9GH76CHw4x2IJWAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAI9p8G/PbyY8rL2686g8t+vnqHTyfgIYfvz/26veTXn/UKX8+f0EU9bHrtu/6KfAN/AwEXAj7lFf2TBFw4nae8on+SgIvJ01n/KLzpDK+L3bT/Ap4O+HC+V12mTH+M3gzcLbIY/EO6HfxYp13k09nb6r3UqcdnjoCL3ll72J26h+35Oxy2XvZ0wOLaXq9v2+F1UC+7RZtMZ/DnfX1lwDOPzwUCLo7O2trtDK8H3M/iqoc6bj1subT68XTA/F7bGJooyzKbhTvLPHY8eJLHlbNX1DqYUVfdXbqwJjsCLsans37aNNJM6w68OR0wv9f9ymKw3k67yn2ZZpHlg3a3zis60s6oV+ZvlzMCLoanc3Dsdt9TdWT/lM8OmNjr5KY72jmzq1zfrbvXtVtmRMDF8HTWcgaaqIrD1U4G/MFewxrW262s5jS/Fzpmdts6mnHy+Fwl4GJ0OjsNrG1P/y7CNo3+gEt7jW56MVprNed7A/5w+n6YJ+BieDpnj/jO6pweTz0acGWvmZveL9XOmd3x6wKuTt8PEwRczLRw4eje1XX7c/cDruw1uuneOu2c4aOvzI57mJhRh1xZlQ0BF+Oz9vcF96fuB1zYa7R2b5mD6/XSwdfg8snj4q21+W/L02dfzIxhQMDFyTm6Hd7m+JYP7rPKT5sRuzhOBywm91rUkYc3fV9ltchtr8VmzuGOdfDB9N1tFYefNfdXLmyGjNZkhoCLUQufVqd/7z7rUcLW/XieDvg0s9difNOdRV5ePibt5vTuazusWbF9rs2E5v4mH58LBFyMW7g5OID7s9cMuTygmt9rcNPb5MrAz0lHc3Z9Ht7XZsxqxO36ZtLR/c0+PpMEzLOc/4LhrwmYZ6lfywJ+JgHzJPr9DgLmi23/zdXvcwmYL7YKWL1PJ2AIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmCI9f7+G6yFxVg/GyYwAAAAAElFTkSuQmCC",
+ expectedOutput: "Tesseract.js\n",
+ recipeConfig: [
+ {
+ "op": "From Base64",
+ "args": ["A-Za-z0-9+/=", true]
+ },
+ {
+ "op": "Optical Character Recognition",
+ "args": [false]
+ }
+ ]
+ }*/
]);