From 7cfb5e0b2a42622500a60054eff1c00b0ced26ba Mon Sep 17 00:00:00 2001
From: sw5678 <151949597+sw5678@users.noreply.github.com>
Date: Mon, 12 Feb 2024 14:52:46 +0000
Subject: [PATCH 1/3] Added RAKE functionality into CC
---
src/core/config/Categories.json | 3 +-
src/core/operations/RAKE.mjs | 149 ++++++++++++++++++++++++++++++++
tests/operations/index.mjs | 1 +
tests/operations/tests/RAKE.mjs | 22 +++++
4 files changed, 174 insertions(+), 1 deletion(-)
create mode 100644 src/core/operations/RAKE.mjs
create mode 100644 tests/operations/tests/RAKE.mjs
diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json
index cf4d91be..ab26cfe0 100644
--- a/src/core/config/Categories.json
+++ b/src/core/config/Categories.json
@@ -328,7 +328,8 @@
"CSS selector",
"Extract EXIF",
"Extract ID3",
- "Extract Files"
+ "Extract Files",
+ "RAKE"
]
},
{
diff --git a/src/core/operations/RAKE.mjs b/src/core/operations/RAKE.mjs
new file mode 100644
index 00000000..d1165b51
--- /dev/null
+++ b/src/core/operations/RAKE.mjs
@@ -0,0 +1,149 @@
+/**
+ * @author sw5678
+ * @copyright Crown Copyright 2024
+ * @license Apache-2.0
+ */
+
+import Operation from "../Operation.mjs";
+
+/**
+ * RAKE operation
+ */
+class RAKE extends Operation {
+
+    /**
+     * RAKE constructor: declares the operation metadata and its three text arguments
+     */
+    constructor() {
+        super();
+
+        this.name = "RAKE";
+        this.module = "Default";
+        this.description = [
+            "Rapid Automatic Keyword Extraction (RAKE)",
+            "<br>",
+            "RAKE is a domain-independent keyword extraction algorithm in Natural Language Processing.",
+            "<br>",
+            "The list of stop words are from the NLTK python package",
+        ].join("\n");
+        this.inputType = "string";
+        this.outputType = "string";
+        this.args = [
+            {
+                name: "Word Delimiter (Regex)",
+                type: "text",
+                value: "\\s"
+            },
+            {
+                name: "Sentence Delimiter (Regex)",
+                type: "text",
+                value: "\\.\\s|\\n"
+            },
+            {
+                name: "Stop Words",
+                type: "text",
+                value: "i,me,my,myself,we,our,ours,ourselves,you,you're,you've,you'll,you'd,your,yours,yourself,yourselves,he,him,his,himself,she,she's,her,hers,herself,it,it's,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,that'll,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,don't,should,should've,now,d,ll,m,o,re,ve,y,ain,aren,aren't,couldn,couldn't,didn,didn't,doesn,doesn't,hadn,hadn't,hasn,hasn't,haven,haven't,isn,isn't,ma,mightn,mightn't,mustn,mustn't,needn,needn't,shan,shan't,shouldn,shouldn't,wasn,wasn't,weren,weren't,won,won't,wouldn,wouldn't"
+            }
+        ];
+    }
+
+    /**
+     * @param {string} input - Text to extract keyword phrases from (lower-cased and trimmed before use)
+     * @param {Object[]} args - [word delimiter regex, sentence delimiter regex, comma-separated stop words]
+     * @returns {string} A "Scores: , Keywords: " header line, then one "score, phrase" line per phrase, highest score first
+     */
+    run(input, args) {
+
+        // Compile the user-supplied delimiter patterns as global regexes
+        const wordDelim = new RegExp(args[0], "g");
+        const sentDelim = new RegExp(args[1], "g");
+
+        // Deduplicate the stop words; the empty string is added so that empty tokens also break phrases
+        const stopWords = args[2].toLowerCase().replace(/ /g, "").split(",").unique();
+        stopWords.push("");
+
+        // Lower case input and remove start and ending whitespace
+        input = input.toLowerCase().trim();
+
+        // tokens[i] is a distinct non-stop word and wordFrequencies[i] is how many times it occurred
+        const tokens = [];
+        const wordFrequencies = [];
+        let phrases = [];
+
+        // Build up list of phrases and token counts
+        const sentences = input.split(sentDelim);
+        for (const sent of sentences) {
+
+            // Split sentence into words; startIndex marks where the current candidate phrase begins
+            const splitSent = sent.split(wordDelim);
+            let startIndex = 0;
+
+            for (let i = 0; i < splitSent.length; i++) {
+                const token = splitSent[i];
+                if (stopWords.includes(token)) {
+                    // A stop word ends the current phrase; the next phrase starts after it
+                    phrases.push(splitSent.slice(startIndex, i));
+                    startIndex = i + 1;
+                } else {
+                    // If token is not a stop word add to the count of the list of words
+                    if (tokens.includes(token)) {
+                        wordFrequencies[tokens.indexOf(token)]+=1;
+                    } else {
+                        tokens.push(token);
+                        wordFrequencies.push(1);
+                    }
+                }
+            }
+            phrases.push(splitSent.slice(startIndex));
+        }
+
+        // Remove empty phrases (produced by adjacent stop words or a stop word at a sentence edge)
+        phrases = phrases.filter(subArray => subArray.length > 0);
+
+        // Remove duplicate phrases by comparing their space-joined form
+        const uniquePhrases = [...new Set(phrases.map(function (phrase) {
+            return phrase.join(" ");
+        }))];
+        phrases = uniquePhrases.map(function (phrase) {
+            return phrase.split(" ");
+        });
+
+        // Build the word degree matrix: cell [a][b] counts co-occurrences of tokens a and b within the unique phrases
+        const wordDegreeMatrix = Array.from(Array(tokens.length), _ => Array(tokens.length).fill(0));
+        phrases.forEach(function (phrase) {
+            phrase.forEach(function (word1) {
+                phrase.forEach(function (word2) {
+                    wordDegreeMatrix[tokens.indexOf(word1)][tokens.indexOf(word2)]++;
+                });
+            });
+        });
+
+        // Score each token as its degree (the sum of its matrix column) divided by its frequency
+        const degreeScores = Array(tokens.length).fill(0);
+        for (let i=0; i<tokens.length; i++) {
+            let wordDegree = 0;
+            for (let j=0; j<wordDegreeMatrix.length; j++) {
+                wordDegree += wordDegreeMatrix[j][i];
+            }
+            degreeScores[i] = wordDegree / wordFrequencies[i];
+        }
+
+        // A phrase's score is the sum of the degree scores of its tokens
+        const scores = phrases.map(function (phrase) {
+            let score = 0;
+            phrase.forEach(function (token) {
+                score += degreeScores[tokens.indexOf(token)];
+            });
+            return new Array(score, phrase.join(" "));
+        });
+        scores.sort((a, b) => b[0] - a[0]);
+        scores.unshift(new Array("Scores: ", "Keywords: "));
+
+        // Output works with the 'To Table' functionality already built into CC
+        return scores.map(function (score) {
+            return score.join(", ");
+        }).join("\n");
+    }
+}
+
+export default RAKE;
diff --git a/tests/operations/index.mjs b/tests/operations/index.mjs
index 570fbb6f..169ce844 100644
--- a/tests/operations/index.mjs
+++ b/tests/operations/index.mjs
@@ -74,6 +74,7 @@ import "./tests/ParseIPRange.mjs";
import "./tests/ParseQRCode.mjs";
import "./tests/PEMtoHex.mjs";
import "./tests/PowerSet.mjs";
+import "./tests/RAKE.mjs";
import "./tests/Regex.mjs";
import "./tests/Register.mjs";
import "./tests/Rotate.mjs";
diff --git a/tests/operations/tests/RAKE.mjs b/tests/operations/tests/RAKE.mjs
new file mode 100644
index 00000000..8164ca01
--- /dev/null
+++ b/tests/operations/tests/RAKE.mjs
@@ -0,0 +1,22 @@
+/**
+ * RAKE, Rapid Automatic Keyword Extraction tests.
+ *
+ * @author sw5678
+ * @copyright Crown Copyright 2024
+ * @license Apache-2.0
+ */
+import TestRegister from "../../lib/TestRegister.mjs";
+
+TestRegister.addTests([
+ {
+ "name": "RAKE: Basic Example",
+ "input": "test1 test2. test2",
+ "expectedOutput": "Scores: , Keywords: \n3.5, test1 test2\n1.5, test2",
+ "recipeConfig": [
+ {
+ "op": "RAKE",
+ "args": ["\\s", "\\.\\s|\\n", "i,me,my,myself,we,our"]
+ },
+ ],
+ }
+]);
From 774828823cf3264f918daa4dd9fcc495beb8bf14 Mon Sep 17 00:00:00 2001
From: sw5678 <151949597+sw5678@users.noreply.github.com>
Date: Tue, 13 Feb 2024 11:52:19 +0000
Subject: [PATCH 2/3] Adding RAKE test import back after merge conflict
---
tests/operations/index.mjs | 1 +
1 file changed, 1 insertion(+)
diff --git a/tests/operations/index.mjs b/tests/operations/index.mjs
index 98374650..0a8fe316 100644
--- a/tests/operations/index.mjs
+++ b/tests/operations/index.mjs
@@ -117,6 +117,7 @@ import "./tests/PHP.mjs";
import "./tests/PowerSet.mjs";
import "./tests/Protobuf.mjs";
import "./tests/Rabbit.mjs";
+import "./tests/RAKE.mjs";
import "./tests/Regex.mjs";
import "./tests/Register.mjs";
import "./tests/RisonEncodeDecode.mjs";
From 63449872da39c3bc301a833c49a41901ee9949ed Mon Sep 17 00:00:00 2001
From: GoForceX <67143590+GoForceX@users.noreply.github.com>
Date: Wed, 14 Feb 2024 10:56:00 +0000
Subject: [PATCH 3/3] Fix JSON folder folding in Firefox
---
src/web/stylesheets/operations/json.css | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/web/stylesheets/operations/json.css b/src/web/stylesheets/operations/json.css
index 22c07128..27861c8f 100644
--- a/src/web/stylesheets/operations/json.css
+++ b/src/web/stylesheets/operations/json.css
@@ -44,7 +44,8 @@ ul.json-dict, ol.json-array {
display: contents;
}
.json-summary {
- display: contents;
+ display: inline;
+ list-style: none;
}
/* Display object and array brackets when closed */