Adding GPT Token Parser display

2025-06-14 10:14:53 -04:00 · 2025-06-08 19:17:23 -04:00 · 2025-06-08 19:17:23 -04:00 · dd583a4943
commit dd583a4943
parent 8443330abd
3 changed files with 150 additions and 123 deletions
--- a/src/core/config/Categories.json
+++ b/src/core/config/Categories.json
@ -5,7 +5,7 @@
    },
    {
        "name": "AI",
-        "ops": ["Count AI Tokens"]
+        "ops": ["Count AI Tokens", "Parse AI Tokens"]
    },
    {
        "name": "Data format",
--- a/src/core/lib/GPTTokenizer.mjs
+++ b/src/core/lib/GPTTokenizer.mjs
@ -6,104 +6,30 @@
 * @license Apache-2.0
 */

-/**
- * Convert an imported module in to a solid type
- * @param m an imported module
- * @returns {TokenizerModule}
- */
 const exportModule = (m) => {
    return {
        countTokens: m.countTokens, // # of tokens
-        encode: m.encode,           // tokens
-        decode: m.decode,           // token ids
+        encode: m.encode,           // token ids
+        decodeGenerator: m.decodeGenerator, // tokens
    };
 };

 export const defaultValue = Symbol("*");

 // Tokenizer module constants
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const GPT_35_TURBO_TOKENIZER = () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_EMBEDDING_ADA_002_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-ada-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_EMBEDDING_3_LARGE_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-large").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_EMBEDDING_3_SMALL_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-small").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CODE_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CODE_CUSHMAN_002_TOKENIZER = () => import("gpt-tokenizer/model/code-cushman-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_DAVINCI_003_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-003").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-edit-001").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CODE_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-edit-001").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const DAVINCI_TOKENIZER = () => import("gpt-tokenizer/model/davinci").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CURIE_TOKENIZER = () => import("gpt-tokenizer/model/curie").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const BABBAGE_TOKENIZER = () => import("gpt-tokenizer/model/babbage").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const ADA_TOKENIZER = () => import("gpt-tokenizer/model/ada").then(m => exportModule(m));

 // This mapping returns a Promise that resolves to the correct countTokens function for the model.
@ -143,49 +69,3 @@ export const MODEL_TO_MODULES = {
    "babbage": BABBAGE_TOKENIZER,
    "ada": ADA_TOKENIZER,
 };
-
-/**
- * @typedef {Object} EncodeOptions
- * @property {Set<string>|'all'} [allowedSpecial] - A list of special tokens that are allowed in the input.
- * If set to 'all', all special tokens are allowed except those in disallowedSpecial.
- * @default undefined
- * @property {Set<string>|'all'} [disallowedSpecial] - A list of special tokens that are disallowed in the input.
- * If set to 'all', all special tokens are disallowed except those in allowedSpecial.
- * @default 'all'
- */
-
-/**
- * @typedef {Object} ChatMessage
- * @property {'system'|'user'|'assistant'} [role] - The role of the message sender.
- * @property {string} [name] - The name of the message sender.
- * @property {string} content - The content of the message.
- */
-
-/**
- * @func EncodeFn
- * @param {string} lineToEncode - The string to encode.
- * @param {EncodeOptions} [encodeOptions] - Optional encoding options.
- * @returns {number[]} An array of numbers representing the encoded result.
- */
-
-/**
- * @func DecodeFn
- * @param {Iterable<number>} inputTokensToDecode - An iterable collection of numbers to decode.
- * @returns {string} The decoded string.
- */
-
-/**
- * A function that counts tokens.
- *
- * @func CountTokensFn
- * @param {string | Iterable<ChatMessage>} input - The input string or an iterable of ChatMessage objects.
- * @param {EncodeOptions} [encodeOptions] - Optional encoding options to customize the token counting process.
- * @returns {number} The total number of tokens counted.
- */
-
-/**
- * @typedef {Object} TokenizerModule
- * @property {CountTokensFn} countTokens - Function to count tokens in input
- * @property {DecodeFn} decode - Function to convert token IDs back to text
- * @property {EncodeFn} encode - Function to convert text to token IDs
- */
--- a/src/core/operations/ParseAITokens.mjs
+++ b/src/core/operations/ParseAITokens.mjs
@ -0,0 +1,147 @@
+/**
+ * @author grmartin [grmartin@engineer.com]
+ * @copyright Crown Copyright 2016
+ * @license Apache-2.0
+ */
+
+import Operation from "../Operation.mjs";
+import {defaultValue, MODEL_TO_MODULES} from "../lib/GPTTokenizer.mjs";
+
+// Translucent background colours for the rendered token spans.
+// ParseAITokens.format picks one per token by index, wrapping around with
+// `i % pastelColors.length`, so adjacent tokens get different colours.
+const pastelColors = [
+    "rgba(102,197,204,.4)",
+    "rgba(246,207,113,.4)",
+    "rgba(248,156,116,.4)",
+    "rgba(239,65,70,.4)",
+    "rgba(220,176,242,.4)",
+    "rgba(135,197,95,.4)",
+    "rgba(158,185,243,.4)",
+    "rgba(254,136,177,.4)",
+    "rgba(201,219,116,.4)",
+    "rgba(139,224,164,.4)",
+    "rgba(180,151,231,.4)",
+];
+
+/**
+ * Parse AI Tokens operation
+ *
+ * Encodes the input with the tokenizer of the selected model and renders every
+ * token (or token ID) as a colour-coded HTML span, followed by the character
+ * and token counts.
+ */
+class ParseAITokens extends Operation {
+
+    /**
+     * Parse AI Tokens constructor
+     */
+    constructor() {
+        super();
+
+        this.name = "Parse AI Tokens";
+        this.module = "AI";
+        this.infoURL = "https://github.com/niieani/gpt-tokenizer";
+        this.description = "Parses the GPT tokens in the input text using niieani/gpt-tokenizer. Select the model to use the correct encoding.";
+        this.inputType = "string";
+        this.outputType = "html";
+        this.args = [
+            {
+                name: "Model",
+                type: "option",
+                value: Object.keys(MODEL_TO_MODULES),
+            },
+            {
+                name: "Show Token IDs",
+                type: "boolean",
+                value: false
+            }
+        ];
+    }
+
+    /**
+     * Tokenizes the input and returns the HTML visualisation.
+     *
+     * @param {string} input
+     * @param {Object[]} args - [model name, show token IDs instead of token text]
+     * @returns {Promise<string>} HTML markup, or an empty string for empty input
+     */
+    async run(input, args) {
+        if (!input) return "";
+
+        const [model, showIds] = args;
+
+        // fallback to default (gpt-3.5-turbo encoding) when the model is unknown
+        const loadTokenizer = MODEL_TO_MODULES[model] || MODEL_TO_MODULES[defaultValue];
+        const fns = await loadTokenizer();
+
+        const encodedTokens = fns.encode(input); // IDs
+
+        const displayTokens = showIds ?
+            encodedTokens.map((id) => id.toString()) :
+            Array.from(fns.decodeGenerator(encodedTokens));
+
+        // Count the encoded IDs rather than the displayed strings.
+        // NOTE(review): decodeGenerator may yield fewer strings than IDs when a
+        // multi-byte character spans several tokens - confirm against gpt-tokenizer.
+        return this.format(input, displayTokens, encodedTokens.length);
+    }
+
+    /**
+     * Format HTML
+     *
+     * @param {string} input - the original text, used for the character count
+     * @param {string[]} tokens - token texts (or token IDs as strings) to display
+     * @param {number} [tokenCount=tokens.length] - token count to report
+     * @returns {string} HTML markup
+     */
+    format(input, tokens, tokenCount = tokens.length) {
+
+        const tokenHtml = tokens.map((t, i) => {
+            // Escape first so token text is never interpreted as markup, then
+            // make spaces and newlines visible. Entities are used instead of a
+            // raw U+00A0 because replaceSpacesOutsideTags() strips everything
+            // matched by \s, and \s matches U+00A0 in JavaScript.
+            const tok = this.escapeHtml(t)
+                .replaceAll(" ", "&nbsp;")
+                .replaceAll("\n", "&lt;newline&gt;");
+
+            const css = [
+                `background-color:${pastelColors[i % pastelColors.length]}`,
+                "padding: 0 0",
+                "border-radius: 3px",
+                "margin-right: 0",
+                "margin-bottom: 4px",
+                "display: inline-block",
+                "height: 1.5em"
+            ];
+
+            return `<span style="${css.join(";")}">${tok}</span>`;
+        });
+
+        return this.replaceSpacesOutsideTags(`
+            <div style="padding: 0; margin: 0">
+                <h1>Tokens</h1>
+                <p style="font-family: monospace">
+                    ${tokenHtml.join("")}
+                </p>
+                <hr />
+                <ul style="list-style: none; padding-left: 0">
+                    <li><strong>Characters:</strong>&nbsp;${input.length}</li>
+                    <li><strong>Tokens:</strong>&nbsp;${tokenCount}</li>
+                </ul>
+            </div>`
+        );
+    }
+
+    /**
+     * Escape the characters that are significant in HTML text and attributes.
+     *
+     * @param {string} text
+     * @returns {string}
+     */
+    escapeHtml(text) {
+        // "&" must be replaced first so the entities added below stay intact.
+        return text
+            .replaceAll("&", "&amp;")
+            .replaceAll("<", "&lt;")
+            .replaceAll(">", "&gt;")
+            .replaceAll("\"", "&quot;")
+            .replaceAll("'", "&#39;");
+    }
+
+    /**
+     * Remove every run of whitespace that is not inside an HTML tag, then
+     * strip any line breaks that remain (i.e. those inside tags).
+     *
+     * @param {string} htmlString
+     * @returns {string}
+     */
+    replaceSpacesOutsideTags(htmlString) {
+        // The tag alternative is tried first, so whitespace inside a tag
+        // (e.g. within a style attribute) is kept; `tag` is undefined when
+        // the whitespace alternative matched.
+        return htmlString
+            .replace(/(<[^>]*?>)|(\s+)/g, (match, tag) => tag || "")
+            .replace(/[\r\n]/g, "");
+    }
+
+}
+
+export default ParseAITokens;
+