Mirror of https://github.com/gchq/CyberChef.git, synced 2025-06-14 10:14:53 -04:00
Adding GPT Token Parser display
commit dd583a4943, parent 8443330abd

3 changed files with 150 additions and 123 deletions
@@ -5,7 +5,7 @@
         },
         {
             "name": "AI",
-            "ops": ["Count AI Tokens"]
+            "ops": ["Count AI Tokens", "Parse AI Tokens"]
         },
         {
             "name": "Data format",
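With the operation registered under the "AI" category, it can be referenced from a recipe like any other CyberChef operation. A hedged sketch of a recipe configuration (not part of this commit): the two arguments follow the "Model" and "Show Token IDs" options declared in ParseAITokens.mjs, and "gpt-3.5-turbo" is assumed here to be one of the model keys; an unrecognised name falls back to the default encoding.

    [
        { "op": "Parse AI Tokens", "args": ["gpt-3.5-turbo", false] }
    ]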
@@ -6,104 +6,30 @@
  * @license Apache-2.0
  */

-/**
- * Convert an imported module in to a solid type
- * @param m an imported module
- * @returns {TokenizerModule}
- */
 const exportModule = (m) => {
     return {
         countTokens: m.countTokens, // # of tokens
-        encode: m.encode, // tokens
-        decode: m.decode, // token ids
+        encode: m.encode, // token ids
+        decodeGenerator: m.decodeGenerator, // tokens
     };
 };

 export const defaultValue = Symbol("*");

 // Tokenizer module constants
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const GPT_35_TURBO_TOKENIZER = () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_EMBEDDING_ADA_002_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-ada-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_EMBEDDING_3_LARGE_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-large").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_EMBEDDING_3_SMALL_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-small").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CODE_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CODE_CUSHMAN_002_TOKENIZER = () => import("gpt-tokenizer/model/code-cushman-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_DAVINCI_003_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-003").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-edit-001").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CODE_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-edit-001").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const DAVINCI_TOKENIZER = () => import("gpt-tokenizer/model/davinci").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CURIE_TOKENIZER = () => import("gpt-tokenizer/model/curie").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const BABBAGE_TOKENIZER = () => import("gpt-tokenizer/model/babbage").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const ADA_TOKENIZER = () => import("gpt-tokenizer/model/ada").then(m => exportModule(m));

 // This mapping returns a Promise that resolves to the correct countTokens function for the model.
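For reference, a minimal sketch (not part of the commit) of how one of these lazily loaded tokenizer modules is consumed after this change. It assumes an async context and uses only the three functions that exportModule now keeps.

    // Load the gpt-3.5-turbo tokenizer module on demand.
    const tokenizer = await GPT_35_TURBO_TOKENIZER(); // { countTokens, encode, decodeGenerator }

    const text = "Hello world";
    const ids = tokenizer.encode(text);        // numeric token IDs
    const count = tokenizer.countTokens(text); // number of tokens in the text

    // decodeGenerator yields one decoded string per token ID, which is what
    // the new Parse AI Tokens operation uses to display token boundaries.
    const pieces = [...tokenizer.decodeGenerator(ids)];
    console.log(count, pieces);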
@@ -143,49 +69,3 @@ export const MODEL_TO_MODULES = {
     "babbage": BABBAGE_TOKENIZER,
     "ada": ADA_TOKENIZER,
 };
-
-/**
- * @typedef {Object} EncodeOptions
- * @property {Set<string>|'all'} [allowedSpecial] - A list of special tokens that are allowed in the input.
- * If set to 'all', all special tokens are allowed except those in disallowedSpecial.
- * @default undefined
- * @property {Set<string>|'all'} [disallowedSpecial] - A list of special tokens that are disallowed in the input.
- * If set to 'all', all special tokens are disallowed except those in allowedSpecial.
- * @default 'all'
- */
-
-/**
- * @typedef {Object} ChatMessage
- * @property {'system'|'user'|'assistant'} [role] - The role of the message sender.
- * @property {string} [name] - The name of the message sender.
- * @property {string} content - The content of the message.
- */
-
-/**
- * @func EncodeFn
- * @param {string} lineToEncode - The string to encode.
- * @param {EncodeOptions} [encodeOptions] - Optional encoding options.
- * @returns {number[]} An array of numbers representing the encoded result.
- */
-
-/**
- * @func DecodeFn
- * @param {Iterable<number>} inputTokensToDecode - An iterable collection of numbers to decode.
- * @returns {string} The decoded string.
- */
-
-/**
- * A function that counts tokens.
- *
- * @func CountTokensFn
- * @param {string | Iterable<ChatMessage>} input - The input string or an iterable of ChatMessage objects.
- * @param {EncodeOptions} [encodeOptions] - Optional encoding options to customize the token counting process.
- * @returns {number} The total number of tokens counted.
- */
-
-/**
- * @typedef {Object} TokenizerModule
- * @property {CountTokensFn} countTokens - Function to count tokens in input
- * @property {DecodeFn} decode - Function to convert token IDs back to text
- * @property {EncodeFn} encode - Function to convert text to token IDs
- */
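The removed blocks only documented the module shape; the lookup pattern that both AI operations rely on is unchanged. A sketch of how MODEL_TO_MODULES resolves a user-selected model, with the defaultValue symbol acting as the catch-all entry (loadTokenizer is illustrative and does not exist in the codebase):

    // Resolve a tokenizer loader by model name, falling back to the default entry.
    async function loadTokenizer(model) {
        const loader = MODEL_TO_MODULES[model] || MODEL_TO_MODULES[defaultValue];
        return await loader(); // resolves to { countTokens, encode, decodeGenerator }
    }

Because defaultValue is a Symbol, it is not returned by Object.keys(MODEL_TO_MODULES), so the fallback entry never appears in the "Model" dropdown that the operations build from those keys.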
src/core/operations/ParseAITokens.mjs (new file, 147 lines)
@@ -0,0 +1,147 @@
/**
 * @author grmartin [grmartin@engineer.com]
 * @copyright Crown Copyright 2016
 * @license Apache-2.0
 */

import Operation from "../Operation.mjs";
import {defaultValue, MODEL_TO_MODULES} from "../lib/GPTTokenizer.mjs";

const pastelColors = [
    "rgba(102,197,204,.4)",
    "rgba(246,207,113,.4)",
    "rgba(248,156,116,.4)",
    "rgba(239,65,70,.4)",
    "rgba(220,176,242,.4)",
    "rgba(135,197,95,.4)",
    "rgba(158,185,243,.4)",
    "rgba(254,136,177,.4)",
    "rgba(201,219,116,.4)",
    "rgba(139,224,164,.4)",
    "rgba(180,151,231,.4)",
];

/**
 * Parse AI Tokens operation
 */
class ParseAITokens extends Operation {

    /**
     * Parse AI Tokens constructor
     */
    constructor() {
        super();

        this.name = "Parse AI Tokens";
        this.module = "AI";
        this.infoURL = "https://github.com/niieani/gpt-tokenizer";
        this.description = "Parses the GPT tokens in the input text using niieani/gpt-tokenizer. Select the model to use the correct encoding.";
        this.inputType = "string";
        this.outputType = "html";
        this.args = [
            {
                name: "Model",
                type: "option",
                value: Object.keys(MODEL_TO_MODULES),
            },
            {
                name: "Show Token IDs",
                type: "boolean",
                value: false
            }
        ];
    }

    /**
     * @param {string} input
     * @param {Object[]} args
     * @returns {string}
     */
    async run(input, args) {
        if (!input) return "";

        const [model, showIds] = args;

        let fns;
        if (MODEL_TO_MODULES[model]) {
            fns = (await MODEL_TO_MODULES[model]());
        } else {
            // Fall back to the default (gpt-3.5-turbo) encoding
            fns = (await MODEL_TO_MODULES[defaultValue]());
        }

        const encodedTokens = fns.encode(input); // token IDs

        let displayTokens = [];
        if (showIds) {
            displayTokens = encodedTokens.map((x) => x.toString());
        } else {
            const tokens = [];
            for (const token of fns.decodeGenerator(encodedTokens)) {
                tokens.push(token);
            }
            displayTokens = tokens;
        }

        return this.format(input, displayTokens);
    }

    /**
     * Format HTML
     * @param {string} input
     * @param {string[]} tokens
     */
    format(input, tokens) {

        const tokenHtml = tokens.map((t, i) => {
            const tok = t.replaceAll(" ", "\u00A0")
                .replaceAll("\n", "<newline>");

            const css = [
                `background-color:${pastelColors[i % pastelColors.length]}`,
                "padding: 0 0",
                "border-radius: 3px",
                "margin-right: 0",
                "margin-bottom: 4px",
                "display: inline-block",
                "height: 1.5em"
            ];

            return `<span style="${css.join(";")}">${tok}</span>`;
        });

        return this.replaceSpacesOutsideTags(`
            <div style="padding: 0; margin: 0">
                <h1>Tokens</h1>
                <p style="font-family: monospace">
                    ${tokenHtml.join("")}
                </p>
                <hr />
                <ul style="list-style: none; padding-left: 0">
                    <li><strong>Characters:</strong> ${input.length}</li>
                    <li><strong>Tokens:</strong> ${tokens.length}</li>
                </ul>
            </div>`
        );
    }

    /**
     * Replace all whitespace that does not appear inside an HTML tag.
     * @param {string} htmlString
     * @returns {string}
     */
    replaceSpacesOutsideTags(htmlString) {
        return htmlString.replace(/(<[^>]*?>)|(\s+)/g, function(match, tag, spaces) {
            if (tag) {
                return tag;
            } else if (spaces) {
                return "";
            }
        }).replace(/[\r\n]/g, "");
    }

}

export default ParseAITokens;
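To make the output handling concrete, a small sketch (not part of the commit) of what replaceSpacesOutsideTags does to the generated template: whitespace between tags is stripped, so the template literal's indentation does not leak into the rendered HTML, while whitespace inside a tag, for example in the style attributes, is left alone.

    // Illustrative only; constructing the operation outside the CyberChef
    // framework is assumed to be acceptable for a quick check.
    const op = new ParseAITokens();
    const html = op.replaceSpacesOutsideTags(`
        <div style="padding: 0; margin: 0">
            <h1>Tokens</h1>
        </div>`);
    // html === '<div style="padding: 0; margin: 0"><h1>Tokens</h1></div>'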