/**
 * @author grmartin [grmartin]
 * @copyright Crown Copyright 2016
 * @license Apache-2.0
 */

import Operation from "../Operation.mjs";

// Lazy tokenizer loaders. Each returns a Promise resolving to the
// countTokens function for one encoding. The import specifiers are kept
// as string literals (rather than built dynamically) so bundlers can
// statically resolve each lazy chunk. Models sharing an encoding share
// a single loader instead of repeating identical lambdas.
const cl100kBase = () => import("gpt-tokenizer/model/gpt-3.5-turbo").then(m => m.countTokens);
// gpt-4o uses the o200k_base encoding, NOT cl100k_base — counting it
// with the gpt-3.5-turbo tokenizer produces wrong totals.
const o200kBase = () => import("gpt-tokenizer/model/gpt-4o").then(m => m.countTokens);

// Maps a model name to a loader for its countTokens function.
// Key order is preserved: it defines the order of the UI option list.
const MODEL_TO_COUNT_TOKENS = {
    // cl100k_base models
    "gpt-4": cl100kBase,
    "gpt-4-32k": cl100kBase,
    "gpt-4-turbo": cl100kBase,
    // o200k_base model
    "gpt-4o": o200kBase,
    "gpt-4-0125-preview": cl100kBase,
    "gpt-4-1106-preview": cl100kBase,
    "gpt-3.5-turbo": cl100kBase,
    "gpt-3.5-turbo-16k": cl100kBase,
    "gpt-3.5-turbo-instruct": cl100kBase,
    "gpt-3.5-turbo-0125": cl100kBase,
    "gpt-3.5-turbo-1106": cl100kBase,
    "text-embedding-ada-002": () => import("gpt-tokenizer/model/text-embedding-ada-002").then(m => m.countTokens),
    "text-embedding-3-large": () => import("gpt-tokenizer/model/text-embedding-3-large").then(m => m.countTokens),
    "text-embedding-3-small": () => import("gpt-tokenizer/model/text-embedding-3-small").then(m => m.countTokens),

    // p50k_base models
    "code-davinci-002": () => import("gpt-tokenizer/model/code-davinci-002").then(m => m.countTokens),
    "code-davinci-001": () => import("gpt-tokenizer/model/code-davinci-002").then(m => m.countTokens),
    "code-cushman-002": () => import("gpt-tokenizer/model/code-cushman-002").then(m => m.countTokens),
    "code-cushman-001": () => import("gpt-tokenizer/model/code-cushman-002").then(m => m.countTokens),
    "text-davinci-002": () => import("gpt-tokenizer/model/text-davinci-002").then(m => m.countTokens),
    "text-davinci-003": () => import("gpt-tokenizer/model/text-davinci-003").then(m => m.countTokens),

    // p50k_edit models
    "text-davinci-edit-001": () => import("gpt-tokenizer/model/text-davinci-edit-001").then(m => m.countTokens),
    "code-davinci-edit-001": () => import("gpt-tokenizer/model/code-davinci-edit-001").then(m => m.countTokens),

    // r50k_base models
    "davinci": () => import("gpt-tokenizer/model/davinci").then(m => m.countTokens),
    "curie": () => import("gpt-tokenizer/model/curie").then(m => m.countTokens),
    "babbage": () => import("gpt-tokenizer/model/babbage").then(m => m.countTokens),
    "ada": () => import("gpt-tokenizer/model/ada").then(m => m.countTokens),
};


/**
 * Count AI Tokens operation
 */
class CountAITokens extends Operation {

    /**
     * Count AI Tokens constructor
     */
    constructor() {
        super();

        this.name = "Count AI Tokens";
        this.module = "AI";
        this.infoURL = "https://github.com/niieani/gpt-tokenizer";
        this.description = "Counts the number of GPT tokens in the input text using niieani/gpt-tokenizer. Select the model to use the correct encoding.";
        this.inputType = "string";
        this.outputType = "string";
        this.args = [
            {
                name: "Model",
                type: "option",
                value: Object.keys(MODEL_TO_COUNT_TOKENS),
            }
        ];
    }

    /**
     * Counts the tokens in the input using the tokenizer matching the
     * selected model's encoding.
     *
     * @param {string} input
     * @param {Object[]} args - args[0] is the model name
     * @returns {string} the token count as a decimal string ("" for empty input)
     */
    async run(input, args) {
        if (!input) return "";

        const [model] = args;
        // Unknown models fall back to the cl100k_base (gpt-3.5-turbo)
        // encoding rather than failing.
        const loadCountTokens = MODEL_TO_COUNT_TOKENS[model] ?? cl100kBase;
        const countTokens = await loadCountTokens();

        return countTokens(input).toString();
    }

}

export default CountAITokens;