Mirror of https://github.com/gchq/CyberChef.git, synced 2025-06-14 10:14:53 -04:00
Adding GPT Token Parser display
commit dd583a4943, parent 8443330abd

3 changed files with 150 additions and 123 deletions
@@ -5,7 +5,7 @@
         },
         {
             "name": "AI",
-            "ops": ["Count AI Tokens"]
+            "ops": ["Count AI Tokens", "Parse AI Tokens"]
         },
         {
             "name": "Data format",
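With the operation registered under the "AI" category, it can be referenced from a recipe like any other CyberChef operation. A hedged sketch of a recipe configuration (not part of this commit): the two arguments follow the "Model" and "Show Token IDs" options declared in ParseAITokens.mjs, and "gpt-3.5-turbo" is assumed here to be one of the model keys; an unrecognised name falls back to the default encoding.

    [
        { "op": "Parse AI Tokens", "args": ["gpt-3.5-turbo", false] }
    ]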
@@ -6,104 +6,30 @@
  * @license Apache-2.0
  */

-/**
- * Convert an imported module in to a solid type
- * @param m an imported module
- * @returns {TokenizerModule}
- */
 const exportModule = (m) => {
     return {
         countTokens: m.countTokens, // # of tokens
-        encode: m.encode, // tokens
-        decode: m.decode, // token ids
+        encode: m.encode, // token ids
+        decodeGenerator: m.decodeGenerator, // tokens
     };
 };

 export const defaultValue = Symbol("*");

 // Tokenizer module constants
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const GPT_35_TURBO_TOKENIZER = () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_EMBEDDING_ADA_002_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-ada-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_EMBEDDING_3_LARGE_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-large").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_EMBEDDING_3_SMALL_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-small").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CODE_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CODE_CUSHMAN_002_TOKENIZER = () => import("gpt-tokenizer/model/code-cushman-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-002").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_DAVINCI_003_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-003").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const TEXT_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-edit-001").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CODE_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-edit-001").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const DAVINCI_TOKENIZER = () => import("gpt-tokenizer/model/davinci").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const CURIE_TOKENIZER = () => import("gpt-tokenizer/model/curie").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const BABBAGE_TOKENIZER = () => import("gpt-tokenizer/model/babbage").then(m => exportModule(m));
-
-/**
- * @returns {Promise<TokenizerModule>}
- * @constructor
- */
 const ADA_TOKENIZER = () => import("gpt-tokenizer/model/ada").then(m => exportModule(m));

 // This mapping returns a Promise that resolves to the correct countTokens function for the model.
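For reference, a minimal sketch (not part of the commit) of how one of these lazily loaded tokenizer modules is consumed after this change. It assumes an async context and uses only the three functions that exportModule now keeps.

    // Load the gpt-3.5-turbo tokenizer module on demand.
    const tokenizer = await GPT_35_TURBO_TOKENIZER(); // { countTokens, encode, decodeGenerator }

    const text = "Hello world";
    const ids = tokenizer.encode(text);        // numeric token IDs
    const count = tokenizer.countTokens(text); // number of tokens in the text

    // decodeGenerator yields one decoded string per token ID, which is what
    // the new Parse AI Tokens operation uses to display token boundaries.
    const pieces = [...tokenizer.decodeGenerator(ids)];
    console.log(count, pieces);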
@@ -143,49 +69,3 @@ export const MODEL_TO_MODULES = {
     "babbage": BABBAGE_TOKENIZER,
     "ada": ADA_TOKENIZER,
 };
-
-/**
- * @typedef {Object} EncodeOptions
- * @property {Set<string>|'all'} [allowedSpecial] - A list of special tokens that are allowed in the input.
- * If set to 'all', all special tokens are allowed except those in disallowedSpecial.
- * @default undefined
- * @property {Set<string>|'all'} [disallowedSpecial] - A list of special tokens that are disallowed in the input.
- * If set to 'all', all special tokens are disallowed except those in allowedSpecial.
- * @default 'all'
- */
-
-/**
- * @typedef {Object} ChatMessage
- * @property {'system'|'user'|'assistant'} [role] - The role of the message sender.
- * @property {string} [name] - The name of the message sender.
- * @property {string} content - The content of the message.
- */
-
-/**
- * @func EncodeFn
- * @param {string} lineToEncode - The string to encode.
- * @param {EncodeOptions} [encodeOptions] - Optional encoding options.
- * @returns {number[]} An array of numbers representing the encoded result.
- */
-
-/**
- * @func DecodeFn
- * @param {Iterable<number>} inputTokensToDecode - An iterable collection of numbers to decode.
- * @returns {string} The decoded string.
- */
-
-/**
- * A function that counts tokens.
- *
- * @func CountTokensFn
- * @param {string | Iterable<ChatMessage>} input - The input string or an iterable of ChatMessage objects.
- * @param {EncodeOptions} [encodeOptions] - Optional encoding options to customize the token counting process.
- * @returns {number} The total number of tokens counted.
- */
-
-/**
- * @typedef {Object} TokenizerModule
- * @property {CountTokensFn} countTokens - Function to count tokens in input
- * @property {DecodeFn} decode - Function to convert token IDs back to text
- * @property {EncodeFn} encode - Function to convert text to token IDs
- */
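The removed blocks only documented the module shape; the lookup pattern that both AI operations rely on is unchanged. A sketch of how MODEL_TO_MODULES resolves a user-selected model, with the defaultValue symbol acting as the catch-all entry (loadTokenizer is illustrative and does not exist in the codebase):

    // Resolve a tokenizer loader by model name, falling back to the default entry.
    async function loadTokenizer(model) {
        const loader = MODEL_TO_MODULES[model] || MODEL_TO_MODULES[defaultValue];
        return await loader(); // resolves to { countTokens, encode, decodeGenerator }
    }

Because defaultValue is a Symbol, it is not returned by Object.keys(MODEL_TO_MODULES), so the fallback entry never appears in the "Model" dropdown that the operations build from those keys.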
src/core/operations/ParseAITokens.mjs (new file, 147 lines)
@@ -0,0 +1,147 @@
/**
 * @author grmartin [grmartin@engineer.com]
 * @copyright Crown Copyright 2016
 * @license Apache-2.0
 */

import Operation from "../Operation.mjs";
import {defaultValue, MODEL_TO_MODULES} from "../lib/GPTTokenizer.mjs";

const pastelColors = [
    "rgba(102,197,204,.4)",
    "rgba(246,207,113,.4)",
    "rgba(248,156,116,.4)",
    "rgba(239,65,70,.4)",
    "rgba(220,176,242,.4)",
    "rgba(135,197,95,.4)",
    "rgba(158,185,243,.4)",
    "rgba(254,136,177,.4)",
    "rgba(201,219,116,.4)",
    "rgba(139,224,164,.4)",
    "rgba(180,151,231,.4)",
];

/**
 * Parse AI Tokens operation
 */
class ParseAITokens extends Operation {

    /**
     * Parse AI Tokens constructor
     */
    constructor() {
        super();

        this.name = "Parse AI Tokens";
        this.module = "AI";
        this.infoURL = "https://github.com/niieani/gpt-tokenizer";
        this.description = "Parses the GPT tokens in the input text using niieani/gpt-tokenizer. Select the model to use the correct encoding.";
        this.inputType = "string";
        this.outputType = "html";
        this.args = [
            {
                name: "Model",
                type: "option",
                value: Object.keys(MODEL_TO_MODULES),
            },
            {
                name: "Show Token IDs",
                type: "boolean",
                value: false
            }
        ];
    }

    /**
     * @param {string} input
     * @param {Object[]} args
     * @returns {string}
     */
    async run(input, args) {
        if (!input) return "";

        const [model, showIds] = args;

        let fns;
        if (MODEL_TO_MODULES[model]) {
            fns = (await MODEL_TO_MODULES[model]());
        } else {
            // Fall back to the default (gpt-3.5-turbo) encoding
            fns = (await MODEL_TO_MODULES[defaultValue]());
        }

        const encodedTokens = fns.encode(input); // token IDs

        let displayTokens = [];
        if (showIds) {
            displayTokens = encodedTokens.map((x) => x.toString());
        } else {
            const tokens = [];
            for (const token of fns.decodeGenerator(encodedTokens)) {
                tokens.push(token);
            }
            displayTokens = tokens;
        }

        return this.format(input, displayTokens);
    }

    /**
     * Format HTML
     * @param {string} input
     * @param {string[]} tokens
     */
    format(input, tokens) {

        const tokenHtml = tokens.map((t, i) => {
            const tok = t.replaceAll(" ", "\u00A0")
                .replaceAll("\n", "<newline>");

            const css = [
                `background-color:${pastelColors[i % pastelColors.length]}`,
                "padding: 0 0",
                "border-radius: 3px",
                "margin-right: 0",
                "margin-bottom: 4px",
                "display: inline-block",
                "height: 1.5em"
            ];

            return `<span style="${css.join(";")}">${tok}</span>`;
        });

        return this.replaceSpacesOutsideTags(`
            <div style="padding: 0; margin: 0">
                <h1>Tokens</h1>
                <p style="font-family: monospace">
                    ${tokenHtml.join("")}
                </p>
                <hr />
                <ul style="list-style: none; padding-left: 0">
                    <li><strong>Characters:</strong> ${input.length}</li>
                    <li><strong>Tokens:</strong> ${tokens.length}</li>
                </ul>
            </div>`
        );
    }

    /**
     * Replace all whitespace that does not appear inside an HTML tag.
     * @param {string} htmlString
     * @returns {string}
     */
    replaceSpacesOutsideTags(htmlString) {
        return htmlString.replace(/(<[^>]*?>)|(\s+)/g, function(match, tag, spaces) {
            if (tag) {
                return tag;
            } else if (spaces) {
                return "";
            }
        }).replace(/[\r\n]/g, "");
    }

}

export default ParseAITokens;
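To make the output handling concrete, a small sketch (not part of the commit) of what replaceSpacesOutsideTags does to the generated template: whitespace between tags is stripped, so the template literal's indentation does not leak into the rendered HTML, while whitespace inside a tag, for example in the style attributes, is left alone.

    // Illustrative only; constructing the operation outside the CyberChef
    // framework is assumed to be acceptable for a quick check.
    const op = new ParseAITokens();
    const html = op.replaceSpacesOutsideTags(`
        <div style="padding: 0; margin: 0">
            <h1>Tokens</h1>
        </div>`);
    // html === '<div style="padding: 0; margin: 0"><h1>Tokens</h1></div>'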