mirror of https://github.com/gchq/CyberChef.git
synced 2025-06-13 17:55:27 -04:00

Merge 6324a3a808 into c57556f49f
This commit is contained in: commit 59bebc1ed7

6 changed files with 291 additions and 0 deletions
6  package-lock.json  (generated)

@@ -46,6 +46,7 @@
     "file-saver": "^2.0.5",
     "flat": "^6.0.1",
     "geodesy": "1.1.3",
+    "gpt-tokenizer": "^2.9.0",
     "handlebars": "^4.7.8",
     "hash-wasm": "^4.12.0",
     "highlight.js": "^11.9.0",
@@ -10361,6 +10362,11 @@
       "url": "https://github.com/sponsors/ljharb"
     }
   },
+  "node_modules/gpt-tokenizer": {
+    "version": "2.9.0",
+    "resolved": "https://registry.npmjs.org/gpt-tokenizer/-/gpt-tokenizer-2.9.0.tgz",
+    "integrity": "sha512-YSpexBL/k4bfliAzMrRqn3M6+it02LutVyhVpDeMKrC/O9+pCe/5s8U2hYKa2vFLD5/vHhsKc8sOn/qGqII8Kg=="
+  },
   "node_modules/graceful-fs": {
     "version": "4.2.11",
     "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz",
1  package.json

@@ -132,6 +132,7 @@
     "file-saver": "^2.0.5",
     "flat": "^6.0.1",
     "geodesy": "1.1.3",
+    "gpt-tokenizer": "^2.9.0",
     "handlebars": "^4.7.8",
     "hash-wasm": "^4.12.0",
     "highlight.js": "^11.9.0",
4  src/core/config/Categories.json

@@ -3,6 +3,10 @@
         "name": "Favourites",
         "ops": []
     },
+    {
+        "name": "AI",
+        "ops": ["Count AI Tokens", "Parse AI Tokens"]
+    },
     {
         "name": "Data format",
         "ops": [
71  src/core/lib/GPTTokenizer.mjs  Normal file

@@ -0,0 +1,71 @@
// noinspection SpellCheckingInspection

/**
 * @author grmartin [grmartin@engineer.com]
 * @copyright Crown Copyright 2016
 * @license Apache-2.0
 */

const exportModule = (m) => {
    return {
        countTokens: m.countTokens,         // number of tokens in a string
        encode: m.encode,                   // string -> token IDs
        decodeGenerator: m.decodeGenerator, // token IDs -> token strings
    };
};

export const defaultValue = Symbol("*");

// Tokenizer module loaders; each lazily imports a single model's encoding
const GPT_35_TURBO_TOKENIZER = () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => exportModule(m));
const TEXT_EMBEDDING_ADA_002_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-ada-002").then(m => exportModule(m));
const TEXT_EMBEDDING_3_LARGE_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-large").then(m => exportModule(m));
const TEXT_EMBEDDING_3_SMALL_TOKENIZER = () => import("gpt-tokenizer/model/text-embedding-3-small").then(m => exportModule(m));
const CODE_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-002").then(m => exportModule(m));
const CODE_CUSHMAN_002_TOKENIZER = () => import("gpt-tokenizer/model/code-cushman-002").then(m => exportModule(m));
const TEXT_DAVINCI_002_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-002").then(m => exportModule(m));
const TEXT_DAVINCI_003_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-003").then(m => exportModule(m));
const TEXT_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/text-davinci-edit-001").then(m => exportModule(m));
const CODE_DAVINCI_EDIT_001_TOKENIZER = () => import("gpt-tokenizer/model/code-davinci-edit-001").then(m => exportModule(m));
const DAVINCI_TOKENIZER = () => import("gpt-tokenizer/model/davinci").then(m => exportModule(m));
const CURIE_TOKENIZER = () => import("gpt-tokenizer/model/curie").then(m => exportModule(m));
const BABBAGE_TOKENIZER = () => import("gpt-tokenizer/model/babbage").then(m => exportModule(m));
const ADA_TOKENIZER = () => import("gpt-tokenizer/model/ada").then(m => exportModule(m));

// Maps each model name to a loader; calling a loader returns a Promise that
// resolves to the {countTokens, encode, decodeGenerator} functions for that
// model's encoding.
export const MODEL_TO_MODULES = {
    // cl100k_base models
    [defaultValue]: GPT_35_TURBO_TOKENIZER,
    "gpt-4": GPT_35_TURBO_TOKENIZER,
    "gpt-4-32k": GPT_35_TURBO_TOKENIZER,
    "gpt-4-turbo": GPT_35_TURBO_TOKENIZER,
    "gpt-4o": GPT_35_TURBO_TOKENIZER,
    "gpt-4-0125-preview": GPT_35_TURBO_TOKENIZER,
    "gpt-4-1106-preview": GPT_35_TURBO_TOKENIZER,
    "gpt-3.5-turbo": GPT_35_TURBO_TOKENIZER,
    "gpt-3.5-turbo-16k": GPT_35_TURBO_TOKENIZER,
    "gpt-3.5-turbo-instruct": GPT_35_TURBO_TOKENIZER,
    "gpt-3.5-turbo-0125": GPT_35_TURBO_TOKENIZER,
    "gpt-3.5-turbo-1106": GPT_35_TURBO_TOKENIZER,
    "text-embedding-ada-002": TEXT_EMBEDDING_ADA_002_TOKENIZER,
    "text-embedding-3-large": TEXT_EMBEDDING_3_LARGE_TOKENIZER,
    "text-embedding-3-small": TEXT_EMBEDDING_3_SMALL_TOKENIZER,

    // p50k_base models
    "code-davinci-002": CODE_DAVINCI_002_TOKENIZER,
    "code-davinci-001": CODE_DAVINCI_002_TOKENIZER,
    "code-cushman-002": CODE_CUSHMAN_002_TOKENIZER,
    "code-cushman-001": CODE_CUSHMAN_002_TOKENIZER,
    "text-davinci-002": TEXT_DAVINCI_002_TOKENIZER,
    "text-davinci-003": TEXT_DAVINCI_003_TOKENIZER,

    // p50k_edit models
    "text-davinci-edit-001": TEXT_DAVINCI_EDIT_001_TOKENIZER,
    "code-davinci-edit-001": CODE_DAVINCI_EDIT_001_TOKENIZER,

    // r50k_base models
    "davinci": DAVINCI_TOKENIZER,
    "curie": CURIE_TOKENIZER,
    "babbage": BABBAGE_TOKENIZER,
    "ada": ADA_TOKENIZER,
};
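
Because each map entry is a loader rather than an imported module, an encoding's data tables are only fetched the first time a model that needs them is requested. A minimal sketch of how a consumer resolves a tokenizer (illustrative only; assumes a Node ESM context with the gpt-tokenizer package installed):

import {defaultValue, MODEL_TO_MODULES} from "./GPTTokenizer.mjs";

// Look up the loader for the requested model, falling back to the
// default (gpt-3.5-turbo / cl100k_base) loader for unknown names.
const loader = MODEL_TO_MODULES["gpt-4"] || MODEL_TO_MODULES[defaultValue];
const {countTokens, encode} = await loader();

console.log(countTokens("Hello world")); // number of tokens
console.log(encode("Hello world"));      // the corresponding token IDs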
59  src/core/operations/CountAITokens.mjs  Normal file

@@ -0,0 +1,59 @@
/**
 * @author grmartin [grmartin@engineer.com]
 * @copyright Crown Copyright 2016
 * @license Apache-2.0
 */

import Operation from "../Operation.mjs";
import {defaultValue, MODEL_TO_MODULES} from "../lib/GPTTokenizer.mjs";

/**
 * Count AI Tokens operation
 */
class CountAITokens extends Operation {

    /**
     * Count AI Tokens constructor
     */
    constructor() {
        super();

        this.name = "Count AI Tokens";
        this.module = "AI";
        this.infoURL = "https://github.com/niieani/gpt-tokenizer";
        this.description = "Counts the number of GPT tokens in the input text using niieani/gpt-tokenizer. Select the model to use the correct encoding.";
        this.inputType = "string";
        this.outputType = "string";
        this.args = [
            {
                name: "Model",
                type: "option",
                value: Object.keys(MODEL_TO_MODULES),
            }
        ];
    }

    /**
     * @param {string} input
     * @param {Object[]} args
     * @returns {string}
     */
    async run(input, args) {
        if (!input) return "";

        const [model] = args;
        let countTokensFn;
        if (MODEL_TO_MODULES[model]) {
            countTokensFn = (await MODEL_TO_MODULES[model]()).countTokens;
        } else {
            // Fall back to the default (gpt-3.5-turbo) encoding
            countTokensFn = (await MODEL_TO_MODULES[defaultValue]()).countTokens;
        }
        const tokenCount = countTokensFn(input);
        return tokenCount.toString();
    }

}

export default CountAITokens;
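
A sketch of the operation in isolation (illustrative: this calls run() directly rather than through a CyberChef recipe, and assumes the dev environment's module resolution):

import CountAITokens from "./CountAITokens.mjs";

const op = new CountAITokens();
// args order matches this.args: [Model]
const count = await op.run("The quick brown fox", ["gpt-4"]);
console.log(count); // a stringified count, e.g. "4" under cl100k_base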
150  src/core/operations/ParseAITokens.mjs  Normal file

@@ -0,0 +1,150 @@
/**
 * @author grmartin [grmartin@engineer.com]
 * @copyright Crown Copyright 2016
 * @license Apache-2.0
 */

import Operation from "../Operation.mjs";
import {defaultValue, MODEL_TO_MODULES} from "../lib/GPTTokenizer.mjs";

const pastelColors = [
    "rgba(102,197,204,.4)",
    "rgba(246,207,113,.4)",
    "rgba(248,156,116,.4)",
    "rgba(239,65,70,.4)",
    "rgba(220,176,242,.4)",
    "rgba(135,197,95,.4)",
    "rgba(158,185,243,.4)",
    "rgba(254,136,177,.4)",
    "rgba(201,219,116,.4)",
    "rgba(139,224,164,.4)",
    "rgba(180,151,231,.4)",
];

/**
 * Parse AI Tokens operation
 */
class ParseAITokens extends Operation {

    /**
     * Parse AI Tokens constructor
     */
    constructor() {
        super();

        this.name = "Parse AI Tokens";
        this.module = "AI";
        this.infoURL = "https://github.com/niieani/gpt-tokenizer";
        this.description = "Parses the GPT tokens in the input text using niieani/gpt-tokenizer. Select the model to use the correct encoding.";
        this.inputType = "string";
        this.outputType = "html";
        this.args = [
            {
                name: "Model",
                type: "option",
                value: Object.keys(MODEL_TO_MODULES),
            },
            {
                name: "Show Token IDs",
                type: "boolean",
                value: false
            }
        ];
    }

    /**
     * @param {string} input
     * @param {Object[]} args
     * @returns {html}
     */
    async run(input, args) {
        if (!input) return "";

        const [model, showIds] = args;
        let fns;
        if (MODEL_TO_MODULES[model]) {
            fns = await MODEL_TO_MODULES[model]();
        } else {
            // Fall back to the default (gpt-3.5-turbo) encoding
            fns = await MODEL_TO_MODULES[defaultValue]();
        }

        const encodedTokens = fns.encode(input); // token IDs

        let displayTokens;
        if (showIds) {
            displayTokens = encodedTokens.map((x) => x.toString());
        } else {
            const tokens = [];
            for (const token of fns.decodeGenerator(encodedTokens)) {
                tokens.push(token);
            }
            displayTokens = tokens;
        }

        return this.format(input, displayTokens);
    }

    /**
     * Format tokens as highlighted HTML
     * @param {string} input
     * @param {string[]} tokens
     * @returns {string}
     */
    format(input, tokens) {
        const tokenHtml = tokens.map((t, i) => {
            // Escape HTML-sensitive characters; the callback parameter is
            // named c so it does not shadow the map index i used below
            const tok = t
                .replace(/[\u00A0-\u9999<>&]/g, c => "&#" + c.charCodeAt(0) + ";")
                .replaceAll(" ", "\u00A0")
                .replaceAll("\n", "<newline>");

            const css = [
                `background-color:${pastelColors[i % pastelColors.length]}`,
                "padding: 0 0",
                "border-radius: 3px",
                "margin-right: 0",
                "margin-bottom: 4px",
                "display: inline-block",
                "height: 1.5em"
            ];

            return `<span style="${css.join(";")}">${tok}</span>`;
        });

        return this.replaceSpacesOutsideTags(`
            <div style="padding: 0; margin: 0">
                <h1>Tokens</h1>
                <p style="font-family: monospace">
                    ${tokenHtml.join("")}
                </p>
                <hr />
                <ul style="list-style: none; padding-left: 0">
                    <li><strong>Characters:</strong> ${input.length}</li>
                    <li><strong>Tokens:</strong> ${tokens.length}</li>
                </ul>
            </div>`
        );
    }

    /**
     * Replace whitespace outside HTML tags and strip <script> tags.
     * @param {string} htmlString - The input HTML string.
     * @returns {string} - The sanitized and formatted HTML string.
     */
    replaceSpacesOutsideTags(htmlString) {
        return htmlString
            .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/ig, "")
            .replace(/(<[^>]*?>)|([ \t\r\n]+)/g, function(match, tag, spaces) {
                // Keep tags; drop ordinary whitespace. The class is restricted
                // to plain spaces/tabs/newlines because \s would also match the
                // non-breaking spaces (\u00A0) used inside the token spans.
                if (tag) {
                    return tag;
                } else if (spaces) {
                    return "";
                }
            })
            .replace(/[\r\n]/g, "");
    }

}

export default ParseAITokens;
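
The two display modes in a direct-call sketch (illustrative; run() returns an HTML string, which CyberChef renders because outputType is "html"):

import ParseAITokens from "./ParseAITokens.mjs";

const op = new ParseAITokens();
// args order matches this.args: [Model, Show Token IDs]
const tokenText = await op.run("Hello world", ["gpt-3.5-turbo", false]); // coloured spans of token text
const tokenIds  = await op.run("Hello world", ["gpt-3.5-turbo", true]);  // the same spans holding numeric IDs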
|
Loading…
Add table
Add a link
Reference in a new issue