From 6df8c1004f1fb6e522716922d9b31427bf00f126 Mon Sep 17 00:00:00 2001
From: "Glenn R. Martin" <222487+grmartin@users.noreply.github.com>
Date: Tue, 3 Jun 2025 09:22:20 -0400
Subject: [PATCH 1/9] Adding AI Token Counter

(cherry picked from commit 8b20dcbf1b52e3accc430655c2d1d94f058171fe)
---
 package-lock.json                     |   6 ++
 package.json                          |   1 +
 src/core/operations/CountAITokens.mjs | 100 ++++++++++++++++++++++++++
 3 files changed, 107 insertions(+)
 create mode 100644 src/core/operations/CountAITokens.mjs

diff --git a/package-lock.json b/package-lock.json
index b374df4b..6ef3b9f1 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -46,6 +46,7 @@
                 "file-saver": "^2.0.5",
                 "flat": "^6.0.1",
                 "geodesy": "1.1.3",
+                "gpt-tokenizer": "^2.9.0",
                 "handlebars": "^4.7.8",
                 "hash-wasm": "^4.12.0",
                 "highlight.js": "^11.9.0",
@@ -10361,6 +10362,11 @@
                 "url": "https://github.com/sponsors/ljharb"
             }
         },
+        "node_modules/gpt-tokenizer": {
+            "version": "2.9.0",
+            "resolved": "https://registry.npmjs.org/gpt-tokenizer/-/gpt-tokenizer-2.9.0.tgz",
+            "integrity": "sha512-YSpexBL/k4bfliAzMrRqn3M6+it02LutVyhVpDeMKrC/O9+pCe/5s8U2hYKa2vFLD5/vHhsKc8sOn/qGqII8Kg=="
+        },
         "node_modules/graceful-fs": {
             "version": "4.2.11",
             "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz",
diff --git a/package.json b/package.json
index 9191ab6f..aab5da6a 100644
--- a/package.json
+++ b/package.json
@@ -132,6 +132,7 @@
         "file-saver": "^2.0.5",
         "flat": "^6.0.1",
         "geodesy": "1.1.3",
+        "gpt-tokenizer": "^2.9.0",
         "handlebars": "^4.7.8",
         "hash-wasm": "^4.12.0",
         "highlight.js": "^11.9.0",
diff --git a/src/core/operations/CountAITokens.mjs b/src/core/operations/CountAITokens.mjs
new file mode 100644
index 00000000..ecba26ee
--- /dev/null
+++ b/src/core/operations/CountAITokens.mjs
@@ -0,0 +1,100 @@
+/**
+ * @author grmartin [grmartin]
+ * @copyright Crown Copyright 2016
+ * @license Apache-2.0
+ */
+
+import Operation from "../Operation.mjs";
+
+// This mapping returns a Promise that resolves to the correct countTokens function for the model.
+const MODEL_TO_COUNT_TOKENS = {
+    // cl100k_base models
+    "gpt-4": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
+    "gpt-4-32k": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
+    "gpt-4-turbo": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
+    "gpt-4o": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
+    "gpt-4-0125-preview": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
+    "gpt-4-1106-preview": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
+    "gpt-3.5-turbo": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
+    "gpt-3.5-turbo-16k": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
+    "gpt-3.5-turbo-instruct": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
+    "gpt-3.5-turbo-0125": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
+    "gpt-3.5-turbo-1106": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens),
+    "text-embedding-ada-002": () => import("gpt-tokenizer/model/text-embedding-ada-002").then(m => m.countTokens),
+    "text-embedding-3-large": () => import("gpt-tokenizer/model/text-embedding-3-large").then(m => m.countTokens),
+    "text-embedding-3-small": () => import("gpt-tokenizer/model/text-embedding-3-small").then(m => m.countTokens),
+
+    // p50k_base models
+    "code-davinci-002": () => import("gpt-tokenizer/model/code-davinci-002").then(m => m.countTokens),
+    "code-davinci-001": () => import("gpt-tokenizer/model/code-davinci-002").then(m => m.countTokens),
+    "code-cushman-002": () => import("gpt-tokenizer/model/code-cushman-002").then(m => m.countTokens),
+    "code-cushman-001": () => import("gpt-tokenizer/model/code-cushman-002").then(m => m.countTokens),
+    "text-davinci-002": () => import("gpt-tokenizer/model/text-davinci-002").then(m => m.countTokens),
+    "text-davinci-003": () => import("gpt-tokenizer/model/text-davinci-003").then(m => m.countTokens),
+
+    // p50k_edit models
+    "text-davinci-edit-001": () => import("gpt-tokenizer/model/text-davinci-edit-001").then(m => m.countTokens),
+    "code-davinci-edit-001": () => import("gpt-tokenizer/model/code-davinci-edit-001").then(m => m.countTokens),
+
+    // r50k_base models
+    "davinci": () => import("gpt-tokenizer/model/davinci").then(m => m.countTokens),
+    "curie": () => import("gpt-tokenizer/model/curie").then(m => m.countTokens),
+    "babbage": () => import("gpt-tokenizer/model/babbage").then(m => m.countTokens),
+    "ada": () => import("gpt-tokenizer/model/ada").then(m => m.countTokens),
+};
+
+
+/**
+ * Count AI Tokens operation
+ */
+class CountAITokens extends Operation {
+
+    /**
+     * Count AI Tokens constructor
+     */
+    constructor() {
+        super();
+
+        this.name = "Count AI Tokens";
+        this.module = "AI";
+        this.infoURL = "https://github.com/niieani/gpt-tokenizer";
+        this.description = "Counts the number of GPT tokens in the input text using niieani/gpt-tokenizer. Select the model to use the correct encoding.";
+        this.inputType = "string";
+        this.outputType = "string";
+        this.args = [
+            {
+                name: "Model",
+                type: "option",
+                value: Object.keys(MODEL_TO_COUNT_TOKENS),
+            }
+        ];
+    }
+
+    /**
+     * @param {string} input
+     * @param {Object[]} args
+     * @returns {string}
+     */
+    async run(input, args) {
+        if (!input) return "";
+        // const [model] = args;
+        // // Use the mapping, fallback to cl100k_base if not found
+        // const encoding = MODEL_TO_ENCODING[model] || cl100k_base;
+        // const tokenCount = encoding.;
+        // return tokenCount.toString();
+        const [model] = args;
+        let countTokensFn;
+        if (MODEL_TO_COUNT_TOKENS[model]) {
+            countTokensFn = await MODEL_TO_COUNT_TOKENS[model]();
+        } else {
+            // fallback to default (gpt-3.5-turbo encoding)
+            countTokensFn = (await import("gpt-tokenizer/model/gpt-3.5-turbo")).countTokens;
+        }
+        const tokenCount = countTokensFn(input);
+        return tokenCount.toString();
+    }
+
+}
+
+export default CountAITokens;
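Note: the mapping above resolves a model-specific countTokens function lazily via dynamic import(), so only the selected model's encoding data is ever loaded. As a minimal illustrative sketch (not part of the patch), assuming gpt-tokenizer's model entry points export countTokens(text) as the patch relies on:

    import { countTokens } from "gpt-tokenizer/model/gpt-3.5-turbo";

    // "hello world" encodes to two cl100k_base tokens: "hello" and " world"
    console.log(countTokens("hello world")); // 2
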
From 233eb3d452699a3bef117e6187cd361cf6cc6624 Mon Sep 17 00:00:00 2001
From: "Glenn R. Martin" <222487+grmartin@users.noreply.github.com>
Date: Tue, 3 Jun 2025 10:44:23 -0400
Subject: [PATCH 2/9] Adding AI Category

(cherry picked from commit 83381be9c7346467919620ecee7a1ba2eb058811)
---
 src/core/config/Categories.json | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json
index 434c8bb6..8d835921 100644
--- a/src/core/config/Categories.json
+++ b/src/core/config/Categories.json
@@ -3,6 +3,10 @@
         "name": "Favourites",
         "ops": []
     },
+    {
+        "name": "AI",
+        "ops": ["Count AI Tokens"]
+    },
     {
         "name": "Data format",
         "ops": [

From 0df7ac0bad9566852b5e2b66ca914330be671d6d Mon Sep 17 00:00:00 2001
From: "Glenn R. Martin" <222487+grmartin@users.noreply.github.com>
Date: Tue, 3 Jun 2025 21:17:53 -0400
Subject: [PATCH 3/9] Making the tokenizer into a library.
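Note: an illustrative sketch (not part of the commit) of how a consumer can resolve a tokenizer from the mapping this patch introduces, assuming only the MODEL_TO_MODULES/defaultValue shape shown below; unknown models fall back to the Symbol-keyed default thunk:

    import { defaultValue, MODEL_TO_MODULES } from "./src/core/lib/GPTTokenizer.mjs"; // path as in the tree below

    async function resolveTokenizer(model) {
        // The default entry is keyed by a Symbol, so it never collides with a model name.
        const thunk = MODEL_TO_MODULES[model] || MODEL_TO_MODULES[defaultValue];
        return await thunk(); // resolves to { countTokens, encode, decode }
    }
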
---
 src/core/lib/GPTTokenizer.mjs         | 191 ++++++++++++++++++++++++++
 src/core/operations/CountAITokens.mjs |  49 +------
 2 files changed, 197 insertions(+), 43 deletions(-)
 create mode 100644 src/core/lib/GPTTokenizer.mjs

diff --git a/src/core/lib/GPTTokenizer.mjs b/src/core/lib/GPTTokenizer.mjs
new file mode 100644
index 00000000..ba03727c
--- /dev/null
+++ b/src/core/lib/GPTTokenizer.mjs
@@ -0,0 +1,191 @@
+// noinspection SpellCheckingInspection
+
+/**
+ * @author grmartin [grmartin@engineer.com]
+ * @copyright Crown Copyright 2016
+ * @license Apache-2.0
+ */
+
+/**
+ * Convert an imported module into a solid type
+ * @param m an imported module
+ * @returns {TokenizerModule}
+ */
+const exportModule = (m) => {
+    return {
+        countTokens: m.countTokens, // # of tokens
+        encode: m.encode, // tokens
+        decode: m.decode, // token ids
+    };
+};
+
+export const defaultValue = Symbol("*");
+
+// Tokenizer module constants
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const GPT_35_TURBO_TOKENIZER = () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => exportModule(m));
+
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const TEXT_EMBEDDING_ADA_002_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-ada-002").then(m => exportModule(m));
+
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const TEXT_EMBEDDING_3_LARGE_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-large").then(m => exportModule(m));
+
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const TEXT_EMBEDDING_3_SMALL_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-small").then(m => exportModule(m));
+
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const CODE_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-002").then(m => exportModule(m));
+
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const CODE_CUSHMAN_002_TOKENIZER = () => import("gpt-tokenizer/model/code-cushman-002").then(m => exportModule(m));
+
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const TEXT_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-002").then(m => exportModule(m));
+
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const TEXT_DAVINCI_003_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-003").then(m => exportModule(m));
+
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const TEXT_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-edit-001").then(m => exportModule(m));
+
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const CODE_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-edit-001").then(m => exportModule(m));
+
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const DAVINCI_TOKENIZER = () => import("gpt-tokenizer/model/davinci").then(m => exportModule(m));
+
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const CURIE_TOKENIZER = () => import("gpt-tokenizer/model/curie").then(m => exportModule(m));
+
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const BABBAGE_TOKENIZER = () => import("gpt-tokenizer/model/babbage").then(m => exportModule(m));
+
+/**
+ * @returns {Promise<TokenizerModule>}
+ * @constructor
+ */
+const ADA_TOKENIZER = () => import("gpt-tokenizer/model/ada").then(m => exportModule(m));
+
+// This mapping returns a Promise that resolves to the correct countTokens function for the model.
+export const MODEL_TO_MODULES = {
+    // cl100k_base models
+    [defaultValue]: GPT_35_TURBO_TOKENIZER,
+    "gpt-4": GPT_35_TURBO_TOKENIZER,
+    "gpt-4-32k": GPT_35_TURBO_TOKENIZER,
+    "gpt-4-turbo": GPT_35_TURBO_TOKENIZER,
+    "gpt-4o": GPT_35_TURBO_TOKENIZER,
+    "gpt-4-0125-preview": GPT_35_TURBO_TOKENIZER,
+    "gpt-4-1106-preview": GPT_35_TURBO_TOKENIZER,
+    "gpt-3.5-turbo": GPT_35_TURBO_TOKENIZER,
+    "gpt-3.5-turbo-16k": GPT_35_TURBO_TOKENIZER,
+    "gpt-3.5-turbo-instruct": GPT_35_TURBO_TOKENIZER,
+    "gpt-3.5-turbo-0125": GPT_35_TURBO_TOKENIZER,
+    "gpt-3.5-turbo-1106": GPT_35_TURBO_TOKENIZER,
+    "text-embedding-ada-002": TEXT_EMBEDDING_ADA_002_TOKENIZER,
+    "text-embedding-3-large": TEXT_EMBEDDING_3_LARGE_TOKENIZER,
+    "text-embedding-3-small": TEXT_EMBEDDING_3_SMALL_TOKENIZER,
+
+    // p50k_base models
+    "code-davinci-002": CODE_DAVINCI_002_TOKENIZER,
+    "code-davinci-001": CODE_DAVINCI_002_TOKENIZER,
+    "code-cushman-002": CODE_CUSHMAN_002_TOKENIZER,
+    "code-cushman-001": CODE_CUSHMAN_002_TOKENIZER,
+    "text-davinci-002": TEXT_DAVINCI_002_TOKENIZER,
+    "text-davinci-003": TEXT_DAVINCI_003_TOKENIZER,
+
+    // p50k_edit models
+    "text-davinci-edit-001": TEXT_DAVINCI_EDIT_001_TOKENIZER,
+    "code-davinci-edit-001": CODE_DAVINCI_EDIT_001_TOKENIZER,
+
+    // r50k_base models
+    "davinci": DAVINCI_TOKENIZER,
+    "curie": CURIE_TOKENIZER,
+    "babbage": BABBAGE_TOKENIZER,
+    "ada": ADA_TOKENIZER,
+};
+
+/**
+ * @typedef {Object} EncodeOptions
+ * @property {Set<string>|'all'} [allowedSpecial] - A list of special tokens that are allowed in the input.
+ * If set to 'all', all special tokens are allowed except those in disallowedSpecial.
+ * @default undefined
+ * @property {Set<string>|'all'} [disallowedSpecial] - A list of special tokens that are disallowed in the input.
+ * If set to 'all', all special tokens are disallowed except those in allowedSpecial.
+ * @default 'all'
+ */
+
+/**
+ * @typedef {Object} ChatMessage
+ * @property {'system'|'user'|'assistant'} [role] - The role of the message sender.
+ * @property {string} [name] - The name of the message sender.
+ * @property {string} content - The content of the message.
+ */
+
+/**
+ * @func EncodeFn
+ * @param {string} lineToEncode - The string to encode.
+ * @param {EncodeOptions} [encodeOptions] - Optional encoding options.
+ * @returns {number[]} An array of numbers representing the encoded result.
+ */
+
+/**
+ * @func DecodeFn
+ * @param {Iterable<number>} inputTokensToDecode - An iterable collection of numbers to decode.
+ * @returns {string} The decoded string.
+ */
+
+/**
+ * A function that counts tokens.
+ *
+ * @func CountTokensFn
+ * @param {string | Iterable<ChatMessage>} input - The input string or an iterable of ChatMessage objects.
+ * @param {EncodeOptions} [encodeOptions] - Optional encoding options to customize the token counting process.
+ * @returns {number} The total number of tokens counted.
+ */ + +/** + * @typedef {Object} TokenizerModule + * @property {CountTokensFn} countTokens - Function to count tokens in input + * @property {DecodeFn} decode - Function to convert token IDs back to text + * @property {EncodeFn} encode - Function to convert text to token IDs + */ diff --git a/src/core/operations/CountAITokens.mjs b/src/core/operations/CountAITokens.mjs index ecba26ee..c1876fba 100644 --- a/src/core/operations/CountAITokens.mjs +++ b/src/core/operations/CountAITokens.mjs @@ -1,48 +1,11 @@ /** - * @author grmartin [grmartin] + * @author grmartin [grmartin@engineer.com] * @copyright Crown Copyright 2016 * @license Apache-2.0 */ import Operation from "../Operation.mjs"; - -// This mapping returns a Promise that resolves to the correct countTokens function for the model. -const MODEL_TO_COUNT_TOKENS = { - // cl100k_base models - "gpt-4": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens), - "gpt-4-32k": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens), - "gpt-4-turbo": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens), - "gpt-4o": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens), - "gpt-4-0125-preview": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens), - "gpt-4-1106-preview": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens), - "gpt-3.5-turbo": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens), - "gpt-3.5-turbo-16k": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens), - "gpt-3.5-turbo-instruct": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens), - "gpt-3.5-turbo-0125": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens), - "gpt-3.5-turbo-1106": () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens), - "text-embedding-ada-002": () => import("gpt-tokenizer/model/text-embedding-ada-002").then(m => m.countTokens), - "text-embedding-3-large": () => import("gpt-tokenizer/model/text-embedding-3-large").then(m => m.countTokens), - "text-embedding-3-small": () => import("gpt-tokenizer/model/text-embedding-3-small").then(m => m.countTokens), - - // p50k_base models - "code-davinci-002": () => import("gpt-tokenizer/model/code-davinci-002").then(m => m.countTokens), - "code-davinci-001": () => import("gpt-tokenizer/model/code-davinci-002").then(m => m.countTokens), - "code-cushman-002": () => import("gpt-tokenizer/model/code-cushman-002").then(m => m.countTokens), - "code-cushman-001": () => import("gpt-tokenizer/model/code-cushman-002").then(m => m.countTokens), - "text-davinci-002": () => import("gpt-tokenizer/model/text-davinci-002").then(m => m.countTokens), - "text-davinci-003": () => import("gpt-tokenizer/model/text-davinci-003").then(m => m.countTokens), - - // p50k_edit models - "text-davinci-edit-001": () => import("gpt-tokenizer/model/text-davinci-edit-001").then(m => m.countTokens), - "code-davinci-edit-001": () => import("gpt-tokenizer/model/code-davinci-edit-001").then(m => m.countTokens), - - // r50k_base models - "davinci": () => import("gpt-tokenizer/model/davinci").then(m => m.countTokens), - "curie": () => import("gpt-tokenizer/model/curie").then(m => m.countTokens), - "babbage": () => import("gpt-tokenizer/model/babbage").then(m => m.countTokens), - "ada": () => import("gpt-tokenizer/model/ada").then(m => m.countTokens), -}; - +import {defaultValue, MODEL_TO_MODULES} from "../lib/GPTTokenizer.mjs"; /** * 
  * Count AI Tokens operation
@@ -65,7 +28,7 @@ class CountAITokens extends Operation {
             {
                 name: "Model",
                 type: "option",
-                value: Object.keys(MODEL_TO_COUNT_TOKENS),
+                value: Object.keys(MODEL_TO_MODULES),
             }
         ];
     }
@@ -84,11 +47,11 @@ class CountAITokens extends Operation {
         const [model] = args;
         let countTokensFn;
-        if (MODEL_TO_COUNT_TOKENS[model]) {
-            countTokensFn = await MODEL_TO_COUNT_TOKENS[model]();
+        if (MODEL_TO_MODULES[model]) {
+            countTokensFn = (await MODEL_TO_MODULES[model]()).countTokens;
         } else {
             // fallback to default (gpt-3.5-turbo encoding)
-            countTokensFn = (await import("gpt-tokenizer/model/gpt-3.5-turbo")).countTokens;
+            countTokensFn = (await MODEL_TO_MODULES[defaultValue]()).countTokens;
         }
         const tokenCount = countTokensFn(input);
         return tokenCount.toString();

From 8443330abd94ff111dddeea568866de64fe92a1d Mon Sep 17 00:00:00 2001
From: "Glenn R. Martin" <222487+grmartin@users.noreply.github.com>
Date: Tue, 3 Jun 2025 21:19:28 -0400
Subject: [PATCH 4/9] Removing some unneeded comments

---
 src/core/operations/CountAITokens.mjs | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/core/operations/CountAITokens.mjs b/src/core/operations/CountAITokens.mjs
index c1876fba..72b92090 100644
--- a/src/core/operations/CountAITokens.mjs
+++ b/src/core/operations/CountAITokens.mjs
@@ -40,11 +40,7 @@ class CountAITokens extends Operation {
      */
     async run(input, args) {
         if (!input) return "";
-        // const [model] = args;
-        // // Use the mapping, fallback to cl100k_base if not found
-        // const encoding = MODEL_TO_ENCODING[model] || cl100k_base;
-        // const tokenCount = encoding.;
-        // return tokenCount.toString();
+
         const [model] = args;
         let countTokensFn;
From dd583a4943826943171e3db2afff39fbab8d68ce Mon Sep 17 00:00:00 2001
From: "Glenn R. Martin" <222487+grmartin@users.noreply.github.com>
Date: Sun, 8 Jun 2025 19:17:23 -0400
Subject: [PATCH 5/9] Adding GPT Token Parser display
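Note: an illustrative round trip (not part of the commit) showing how encode() and decodeGenerator() pair up so each token ID can be rendered back as its own text fragment, assuming gpt-tokenizer's generator API that the new operation calls:

    import { encode, decodeGenerator } from "gpt-tokenizer/model/gpt-3.5-turbo";

    const ids = encode("hello world");       // token IDs, e.g. [15339, 1917]
    const parts = [...decodeGenerator(ids)]; // per-token text, e.g. ["hello", " world"]
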
---
 src/core/config/Categories.json       |   2 +-
 src/core/lib/GPTTokenizer.mjs         | 124 +---------------------
 src/core/operations/ParseAITokens.mjs | 147 ++++++++++++++++++++++++++
 3 files changed, 150 insertions(+), 123 deletions(-)
 create mode 100644 src/core/operations/ParseAITokens.mjs

diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json
index 8d835921..010e72a1 100644
--- a/src/core/config/Categories.json
+++ b/src/core/config/Categories.json
@@ -5,7 +5,7 @@
     },
     {
         "name": "AI",
-        "ops": ["Count AI Tokens"]
+        "ops": ["Count AI Tokens", "Parse AI Tokens"]
     },
     {
         "name": "Data format",
diff --git a/src/core/lib/GPTTokenizer.mjs b/src/core/lib/GPTTokenizer.mjs
index ba03727c..e2c57f0a 100644
--- a/src/core/lib/GPTTokenizer.mjs
+++ b/src/core/lib/GPTTokenizer.mjs
@@ -6,104 +6,30 @@
  * @license Apache-2.0
  */
 
-/**
- * Convert an imported module into a solid type
- * @param m an imported module
- * @returns {TokenizerModule}
- */
 const exportModule = (m) => {
     return {
         countTokens: m.countTokens, // # of tokens
-        encode: m.encode, // tokens
-        decode: m.decode, // token ids
+        encode: m.encode, // token ids
+        decodeGenerator: m.decodeGenerator, // tokens
     };
 };
 
 export const defaultValue = Symbol("*");
 
 // Tokenizer module constants
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const GPT_35_TURBO_TOKENIZER = () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_EMBEDDING_ADA_002_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-ada-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_EMBEDDING_3_LARGE_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-large").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_EMBEDDING_3_SMALL_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-small").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CODE_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CODE_CUSHMAN_002_TOKENIZER = () => import("gpt-tokenizer/model/code-cushman-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_DAVINCI_003_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-003").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-edit-001").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CODE_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-edit-001").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const DAVINCI_TOKENIZER = () => import("gpt-tokenizer/model/davinci").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CURIE_TOKENIZER = () => import("gpt-tokenizer/model/curie").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const BABBAGE_TOKENIZER = () => import("gpt-tokenizer/model/babbage").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const ADA_TOKENIZER = () => import("gpt-tokenizer/model/ada").then(m => exportModule(m));
 
 // This mapping returns a Promise that resolves to the correct countTokens function for the model.
@@ -143,49 +69,3 @@ export const MODEL_TO_MODULES = {
     "babbage": BABBAGE_TOKENIZER,
     "ada": ADA_TOKENIZER,
 };
-
-/**
- * @typedef {Object} EncodeOptions
- * @property {Set<string>|'all'} [allowedSpecial] - A list of special tokens that are allowed in the input.
- * If set to 'all', all special tokens are allowed except those in disallowedSpecial.
- * @default undefined
- * @property {Set<string>|'all'} [disallowedSpecial] - A list of special tokens that are disallowed in the input.
- * If set to 'all', all special tokens are disallowed except those in allowedSpecial.
- * @default 'all'
- */
-
-/**
- * @typedef {Object} ChatMessage
- * @property {'system'|'user'|'assistant'} [role] - The role of the message sender.
- * @property {string} [name] - The name of the message sender.
- * @property {string} content - The content of the message.
- */
-
-/**
- * @func EncodeFn
- * @param {string} lineToEncode - The string to encode.
- * @param {EncodeOptions} [encodeOptions] - Optional encoding options.
- * @returns {number[]} An array of numbers representing the encoded result.
- */
-
-/**
- * @func DecodeFn
- * @param {Iterable<number>} inputTokensToDecode - An iterable collection of numbers to decode.
- * @returns {string} The decoded string.
- */
-
-/**
- * A function that counts tokens.
- *
- * @func CountTokensFn
- * @param {string | Iterable<ChatMessage>} input - The input string or an iterable of ChatMessage objects.
- * @param {EncodeOptions} [encodeOptions] - Optional encoding options to customize the token counting process.
- * @returns {number} The total number of tokens counted.
- */
-
-/**
- * @typedef {Object} TokenizerModule
- * @property {CountTokensFn} countTokens - Function to count tokens in input
- * @property {DecodeFn} decode - Function to convert token IDs back to text
- * @property {EncodeFn} encode - Function to convert text to token IDs
- */
diff --git a/src/core/operations/ParseAITokens.mjs b/src/core/operations/ParseAITokens.mjs
new file mode 100644
index 00000000..b56d85d8
--- /dev/null
+++ b/src/core/operations/ParseAITokens.mjs
@@ -0,0 +1,147 @@
+/**
+ * @author grmartin [grmartin@engineer.com]
+ * @copyright Crown Copyright 2016
+ * @license Apache-2.0
+ */
+
+import Operation from "../Operation.mjs";
+import {defaultValue, MODEL_TO_MODULES} from "../lib/GPTTokenizer.mjs";
+
+const pastelColors = [
+    "rgba(102,197,204,.4)",
+    "rgba(246,207,113,.4)",
+    "rgba(248,156,116,.4)",
+    "rgba(239,65,70,.4)",
+    "rgba(220,176,242,.4)",
+    "rgba(135,197,95,.4)",
+    "rgba(158,185,243,.4)",
+    "rgba(254,136,177,.4)",
+    "rgba(201,219,116,.4)",
+    "rgba(139,224,164,.4)",
+    "rgba(180,151,231,.4)",
+];
+
+/**
+ * Parse AI Tokens operation
+ */
+class ParseAITokens extends Operation {
+
+    /**
+     * Parse AI Tokens constructor
+     */
+    constructor() {
+        super();
+
+        this.name = "Parse AI Tokens";
+        this.module = "AI";
+        this.infoURL = "https://github.com/niieani/gpt-tokenizer";
+        this.description = "Parses the GPT tokens in the input text using niieani/gpt-tokenizer. Select the model to use the correct encoding.";
+        this.inputType = "string";
+        this.outputType = "html";
+        this.args = [
+            {
+                name: "Model",
+                type: "option",
+                value: Object.keys(MODEL_TO_MODULES),
+            },
+            {
+                name: "Show Token IDs",
+                type: "boolean",
+                value: false
+            }
+        ];
+    }
+
+    /**
+     * @param {string} input
+     * @param {Object[]} args
+     * @returns {string}
+     */
+    async run(input, args) {
+        if (!input) return "";
+
+        const [model, showIds] = args;
+        let fns;
+        if (MODEL_TO_MODULES[model]) {
+            fns = (await MODEL_TO_MODULES[model]());
+        } else {
+            // fallback to default (gpt-3.5-turbo encoding)
+            fns = (await MODEL_TO_MODULES[defaultValue]());
+        }
+
+        const encodedTokens = fns.encode(input); // IDs
+
+        let displayTokens = [];
+        if (showIds) {
+            displayTokens = encodedTokens.map((x)=> x.toString());
+        } else {
+            const tokens = [];
+            for (const token of fns.decodeGenerator(encodedTokens)) {
+                tokens.push(token);
+            }
+            displayTokens = tokens;
+        }
+
+        return this.format(input, displayTokens);
+
+    };
+
+    /**
+     * Format HTML
+     * @param {string} input
+     * @param {string[]} tokens
+     */
+    format(input, tokens) {
+
+        const tokenHtml = tokens.map((t, i) => {
+            const tok =
+                t.replaceAll(" ", "\u00A0")
+                    .replaceAll("\n", "");
+
+            const css = [
+                `background-color:${pastelColors[i % pastelColors.length]}`,
+                "padding: 0 0",
+                "border-radius: 3px",
+                "margin-right: 0",
+                "margin-bottom: 4px",
+                "display: inline-block",
+                "height: 1.5em"
+            ];
+
+            return `<span style="${css.join(";")}">${tok}</span>`;
+        });
+
+        return this.replaceSpacesOutsideTags(`
+            <div>
+                <h1>Tokens</h1>
+                <div>
+                    ${tokenHtml.join("")}
+                </div>
+                <ul>
+                    <li>Characters: ${input.length}</li>
+                    <li>Tokens: ${tokens.length}</li>
+                </ul>
+            </div>
+        `
+        );
+    };
+
+    /**
+     * Replace all space not starting within the HTML tag.
+     * @param {string} htmlString
+     * @returns {string}
+     */
+    replaceSpacesOutsideTags(htmlString) {
+        return htmlString.replace(/(<[^>]*?>)|(\s+)/g, function(match, tag, spaces) {
+            if (tag) {
+                return tag;
+            } else if (spaces) {
+                return "";
+            }
+        }).replace(/[\r\n]/g, "");
+    };
+
+}
+
+export default ParseAITokens;

From 0b913d070a87bb7f02666610a7e41c1b45133dd1 Mon Sep 17 00:00:00 2001
From: "Glenn R. Martin" <222487+grmartin@users.noreply.github.com>
Date: Mon, 9 Jun 2025 00:02:01 -0400
Subject: [PATCH 6/9] Encoding HTML entities as well as ensuring no script
 tags slip by

---
 src/core/operations/ParseAITokens.mjs | 35 +++++++++++++++++----------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/src/core/operations/ParseAITokens.mjs b/src/core/operations/ParseAITokens.mjs
index b56d85d8..06d4c5f6 100644
--- a/src/core/operations/ParseAITokens.mjs
+++ b/src/core/operations/ParseAITokens.mjs
@@ -95,7 +95,8 @@ class ParseAITokens extends Operation {
         const tokenHtml = tokens.map((t, i) => {
             const tok =
-                t.replaceAll(" ", "\u00A0")
+                t.replace(/[\u00A0-\u9999<>&]/g, i => "&#"+i.charCodeAt(0)+";")
+                    .replaceAll(" ", "\u00A0")
                     .replaceAll("\n", "");
@@ -127,21 +128,29 @@
     };
 
     /**
-     * Replace all space not starting within the HTML tag.
-     * @param {string} htmlString
-     * @returns {string}
+     * Replace spaces outside HTML tags and sanitize
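Note: an illustrative sketch (not part of the series) of the escaping PATCH 6 applies before token text is interpolated into the HTML output. Entity-encoding &, <, > and the U+00A0-U+9999 range is what keeps injected markup such as <script> inert:

    const escapeToken = (t) =>
        t.replace(/[\u00A0-\u9999<>&]/g, (c) => "&#" + c.charCodeAt(0) + ";");

    escapeToken("<script>alert(1)</script>");
    // => "&#60;script&#62;alert(1)&#60;/script&#62;"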