mirror of https://github.com/gchq/CyberChef.git
synced 2025-06-13 17:55:27 -04:00

Merge 6324a3a808 into c57556f49f
This commit is contained in: commit 59bebc1ed7

6 changed files with 291 additions and 0 deletions
6  package-lock.json  (generated)

@@ -46,6 +46,7 @@
     "file-saver": "^2.0.5",
     "flat": "^6.0.1",
     "geodesy": "1.1.3",
+    "gpt-tokenizer": "^2.9.0",
     "handlebars": "^4.7.8",
     "hash-wasm": "^4.12.0",
     "highlight.js": "^11.9.0",
@@ -10361,6 +10362,11 @@
       "url": "https://github.com/sponsors/ljharb"
     }
   },
+  "node_modules/gpt-tokenizer": {
+    "version": "2.9.0",
+    "resolved": "https://registry.npmjs.org/gpt-tokenizer/-/gpt-tokenizer-2.9.0.tgz",
+    "integrity": "sha512-YSpexBL/k4bfliAzMrRqn3M6+it02LutVyhVpDeMKrC/O9+pCe/5s8U2hYKa2vFLD5/vHhsKc8sOn/qGqII8Kg=="
+  },
   "node_modules/graceful-fs": {
     "version": "4.2.11",
     "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz",
1  package.json

@@ -132,6 +132,7 @@
     "file-saver": "^2.0.5",
     "flat": "^6.0.1",
     "geodesy": "1.1.3",
+    "gpt-tokenizer": "^2.9.0",
     "handlebars": "^4.7.8",
     "hash-wasm": "^4.12.0",
     "highlight.js": "^11.9.0",
4  src/core/config/Categories.json

@@ -3,6 +3,10 @@
         "name": "Favourites",
         "ops": []
     },
+    {
+        "name": "AI",
+        "ops": ["Count AI Tokens", "Parse AI Tokens"]
+    },
     {
         "name": "Data format",
         "ops": [
71  src/core/lib/GPTTokenizer.mjs  Normal file

@@ -0,0 +1,71 @@
// noinspection SpellCheckingInspection

/**
 * @author grmartin [grmartin@engineer.com]
 * @copyright Crown Copyright 2016
 * @license Apache-2.0
 */

const exportModule = (m) => {
    return {
        countTokens: m.countTokens,         // number of tokens in a string
        encode: m.encode,                   // string -> token IDs
        decodeGenerator: m.decodeGenerator, // token IDs -> token strings
    };
};

export const defaultValue = Symbol("*");

// Tokenizer module loaders; each lazily imports a single model's encoding
const GPT_35_TURBO_TOKENIZER = () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => exportModule(m));
const TEXT_EMBEDDING_ADA_002_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-ada-002").then(m => exportModule(m));
const TEXT_EMBEDDING_3_LARGE_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-large").then(m => exportModule(m));
const TEXT_EMBEDDING_3_SMALL_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-small").then(m => exportModule(m));
const CODE_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-002").then(m => exportModule(m));
const CODE_CUSHMAN_002_TOKENIZER = () => import("gpt-tokenizer/model/code-cushman-002").then(m => exportModule(m));
const TEXT_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-002").then(m => exportModule(m));
const TEXT_DAVINCI_003_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-003").then(m => exportModule(m));
const TEXT_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-edit-001").then(m => exportModule(m));
const CODE_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-edit-001").then(m => exportModule(m));
const DAVINCI_TOKENIZER = () => import("gpt-tokenizer/model/davinci").then(m => exportModule(m));
const CURIE_TOKENIZER = () => import("gpt-tokenizer/model/curie").then(m => exportModule(m));
const BABBAGE_TOKENIZER = () => import("gpt-tokenizer/model/babbage").then(m => exportModule(m));
const ADA_TOKENIZER = () => import("gpt-tokenizer/model/ada").then(m => exportModule(m));

// Maps each model name to a loader; calling a loader returns a Promise that
// resolves to the {countTokens, encode, decodeGenerator} functions for that
// model's encoding.
export const MODEL_TO_MODULES = {
    // cl100k_base models
    [defaultValue]: GPT_35_TURBO_TOKENIZER,
    "gpt-4": GPT_35_TURBO_TOKENIZER,
    "gpt-4-32k": GPT_35_TURBO_TOKENIZER,
    "gpt-4-turbo": GPT_35_TURBO_TOKENIZER,
    "gpt-4o": GPT_35_TURBO_TOKENIZER,
    "gpt-4-0125-preview": GPT_35_TURBO_TOKENIZER,
    "gpt-4-1106-preview": GPT_35_TURBO_TOKENIZER,
    "gpt-3.5-turbo": GPT_35_TURBO_TOKENIZER,
    "gpt-3.5-turbo-16k": GPT_35_TURBO_TOKENIZER,
    "gpt-3.5-turbo-instruct": GPT_35_TURBO_TOKENIZER,
    "gpt-3.5-turbo-0125": GPT_35_TURBO_TOKENIZER,
    "gpt-3.5-turbo-1106": GPT_35_TURBO_TOKENIZER,
    "text-embedding-ada-002": TEXT_EMBEDDING_ADA_002_TOKENIZER,
    "text-embedding-3-large": TEXT_EMBEDDING_3_LARGE_TOKENIZER,
    "text-embedding-3-small": TEXT_EMBEDDING_3_SMALL_TOKENIZER,

    // p50k_base models
    "code-davinci-002": CODE_DAVINCI_002_TOKENIZER,
    "code-davinci-001": CODE_DAVINCI_002_TOKENIZER,
    "code-cushman-002": CODE_CUSHMAN_002_TOKENIZER,
    "code-cushman-001": CODE_CUSHMAN_002_TOKENIZER,
    "text-davinci-002": TEXT_DAVINCI_002_TOKENIZER,
    "text-davinci-003": TEXT_DAVINCI_003_TOKENIZER,

    // p50k_edit models
    "text-davinci-edit-001": TEXT_DAVINCI_EDIT_001_TOKENIZER,
    "code-davinci-edit-001": CODE_DAVINCI_EDIT_001_TOKENIZER,

    // r50k_base models
    "davinci": DAVINCI_TOKENIZER,
    "curie": CURIE_TOKENIZER,
    "babbage": BABBAGE_TOKENIZER,
    "ada": ADA_TOKENIZER,
};
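
Because each map entry is a loader rather than an imported module, an encoding's data tables are only fetched the first time a model that needs them is requested. A minimal sketch of how a consumer resolves a tokenizer (illustrative only; assumes a Node ESM context with the gpt-tokenizer package installed):

import {defaultValue, MODEL_TO_MODULES} from "./GPTTokenizer.mjs";

// Look up the loader for the requested model, falling back to the
// default (gpt-3.5-turbo / cl100k_base) loader for unknown names.
const loader = MODEL_TO_MODULES["gpt-4"] || MODEL_TO_MODULES[defaultValue];
const {countTokens, encode} = await loader();

console.log(countTokens("Hello world")); // number of tokens
console.log(encode("Hello world"));      // the corresponding token IDs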
59  src/core/operations/CountAITokens.mjs  Normal file

@@ -0,0 +1,59 @@
/**
 * @author grmartin [grmartin@engineer.com]
 * @copyright Crown Copyright 2016
 * @license Apache-2.0
 */

import Operation from "../Operation.mjs";
import {defaultValue, MODEL_TO_MODULES} from "../lib/GPTTokenizer.mjs";

/**
 * Count AI Tokens operation
 */
class CountAITokens extends Operation {

    /**
     * Count AI Tokens constructor
     */
    constructor() {
        super();

        this.name = "Count AI Tokens";
        this.module = "AI";
        this.infoURL = "https://github.com/niieani/gpt-tokenizer";
        this.description = "Counts the number of GPT tokens in the input text using niieani/gpt-tokenizer. Select the model to use the correct encoding.";
        this.inputType = "string";
        this.outputType = "string";
        this.args = [
            {
                name: "Model",
                type: "option",
                value: Object.keys(MODEL_TO_MODULES),
            }
        ];
    }

    /**
     * @param {string} input
     * @param {Object[]} args
     * @returns {string}
     */
    async run(input, args) {
        if (!input) return "";

        const [model] = args;
        let countTokensFn;
        if (MODEL_TO_MODULES[model]) {
            countTokensFn = (await MODEL_TO_MODULES[model]()).countTokens;
        } else {
            // Fall back to the default (gpt-3.5-turbo) encoding
            countTokensFn = (await MODEL_TO_MODULES[defaultValue]()).countTokens;
        }
        const tokenCount = countTokensFn(input);
        return tokenCount.toString();
    }

}

export default CountAITokens;
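
A sketch of the operation in isolation (illustrative: this calls run() directly rather than through a CyberChef recipe, and assumes the dev environment's module resolution):

import CountAITokens from "./CountAITokens.mjs";

const op = new CountAITokens();
// args order matches this.args: [Model]
const count = await op.run("The quick brown fox", ["gpt-4"]);
console.log(count); // a stringified count, e.g. "4" under cl100k_base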
150  src/core/operations/ParseAITokens.mjs  Normal file

@@ -0,0 +1,150 @@
/**
 * @author grmartin [grmartin@engineer.com]
 * @copyright Crown Copyright 2016
 * @license Apache-2.0
 */

import Operation from "../Operation.mjs";
import {defaultValue, MODEL_TO_MODULES} from "../lib/GPTTokenizer.mjs";

const pastelColors = [
    "rgba(102,197,204,.4)",
    "rgba(246,207,113,.4)",
    "rgba(248,156,116,.4)",
    "rgba(239,65,70,.4)",
    "rgba(220,176,242,.4)",
    "rgba(135,197,95,.4)",
    "rgba(158,185,243,.4)",
    "rgba(254,136,177,.4)",
    "rgba(201,219,116,.4)",
    "rgba(139,224,164,.4)",
    "rgba(180,151,231,.4)",
];

/**
 * Parse AI Tokens operation
 */
class ParseAITokens extends Operation {

    /**
     * Parse AI Tokens constructor
     */
    constructor() {
        super();

        this.name = "Parse AI Tokens";
        this.module = "AI";
        this.infoURL = "https://github.com/niieani/gpt-tokenizer";
        this.description = "Parses the GPT tokens in the input text using niieani/gpt-tokenizer. Select the model to use the correct encoding.";
        this.inputType = "string";
        this.outputType = "html";
        this.args = [
            {
                name: "Model",
                type: "option",
                value: Object.keys(MODEL_TO_MODULES),
            },
            {
                name: "Show Token IDs",
                type: "boolean",
                value: false
            }
        ];
    }

    /**
     * @param {string} input
     * @param {Object[]} args
     * @returns {html}
     */
    async run(input, args) {
        if (!input) return "";

        const [model, showIds] = args;
        let fns;
        if (MODEL_TO_MODULES[model]) {
            fns = await MODEL_TO_MODULES[model]();
        } else {
            // Fall back to the default (gpt-3.5-turbo) encoding
            fns = await MODEL_TO_MODULES[defaultValue]();
        }

        const encodedTokens = fns.encode(input); // token IDs

        let displayTokens;
        if (showIds) {
            displayTokens = encodedTokens.map((x) => x.toString());
        } else {
            const tokens = [];
            for (const token of fns.decodeGenerator(encodedTokens)) {
                tokens.push(token);
            }
            displayTokens = tokens;
        }

        return this.format(input, displayTokens);
    }

    /**
     * Format tokens as highlighted HTML
     * @param {string} input
     * @param {string[]} tokens
     * @returns {string}
     */
    format(input, tokens) {
        const tokenHtml = tokens.map((t, i) => {
            // Escape HTML-sensitive characters; the callback parameter is
            // named c so it does not shadow the map index i used below
            const tok = t
                .replace(/[\u00A0-\u9999<>&]/g, c => "&#" + c.charCodeAt(0) + ";")
                .replaceAll(" ", "\u00A0")
                .replaceAll("\n", "<newline>");

            const css = [
                `background-color:${pastelColors[i % pastelColors.length]}`,
                "padding: 0 0",
                "border-radius: 3px",
                "margin-right: 0",
                "margin-bottom: 4px",
                "display: inline-block",
                "height: 1.5em"
            ];

            return `<span style="${css.join(";")}">${tok}</span>`;
        });

        return this.replaceSpacesOutsideTags(`
            <div style="padding: 0; margin: 0">
                <h1>Tokens</h1>
                <p style="font-family: monospace">
                    ${tokenHtml.join("")}
                </p>
                <hr />
                <ul style="list-style: none; padding-left: 0">
                    <li><strong>Characters:</strong> ${input.length}</li>
                    <li><strong>Tokens:</strong> ${tokens.length}</li>
                </ul>
            </div>`
        );
    }

    /**
     * Replace whitespace outside HTML tags and strip <script> tags.
     * @param {string} htmlString - The input HTML string.
     * @returns {string} - The sanitized and formatted HTML string.
     */
    replaceSpacesOutsideTags(htmlString) {
        return htmlString
            .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/ig, "")
            .replace(/(<[^>]*?>)|([ \t\r\n]+)/g, function(match, tag, spaces) {
                // Keep tags; drop ordinary whitespace. The class is restricted
                // to plain spaces/tabs/newlines because \s would also match the
                // non-breaking spaces (\u00A0) used inside the token spans.
                if (tag) {
                    return tag;
                } else if (spaces) {
                    return "";
                }
            })
            .replace(/[\r\n]/g, "");
    }

}

export default ParseAITokens;
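
The two display modes in a direct-call sketch (illustrative; run() returns an HTML string, which CyberChef renders because outputType is "html"):

import ParseAITokens from "./ParseAITokens.mjs";

const op = new ParseAITokens();
// args order matches this.args: [Model, Show Token IDs]
const tokenText = await op.run("Hello world", ["gpt-3.5-turbo", false]); // coloured spans of token text
const tokenIds  = await op.run("Hello world", ["gpt-3.5-turbo", true]);  // the same spans holding numeric IDs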
|
Loading…
Add table
Add a link
Reference in a new issue