diff --git a/docs/src/content/docs/reference/cli/commands.md b/docs/src/content/docs/reference/cli/commands.md index 4714ae9ae9..41a1b4f364 100644 --- a/docs/src/content/docs/reference/cli/commands.md +++ b/docs/src/content/docs/reference/cli/commands.md @@ -588,6 +588,8 @@ Commands: code [query] Parse code using tree sitter and executes a query tokens [options] Count tokens in a set of files + tokenize [options] Tokenizes a piece of text and display the + tokens (in hex format) jsonl2json Converts JSONL files to a JSON file prompty [options] Converts .prompty files to genaiscript jinja2 [options] Renders Jinja2 or prompty template @@ -684,6 +686,21 @@ Options: -h, --help display help for command ``` +### `parse tokenize` + +``` +Usage: genaiscript parse tokenize [options] + +Tokenizes a piece of text and display the tokens (in hex format) + +Arguments: + file file to tokenize + +Options: + -m, --model encoding model + -h, --help display help for command +``` + ### `parse jsonl2json` ``` diff --git a/genaisrc/docs.genai.mts b/genaisrc/docs.genai.mts index d2a4d3c79d..362bd5129d 100644 --- a/genaisrc/docs.genai.mts +++ b/genaisrc/docs.genai.mts @@ -162,7 +162,7 @@ async function generateDocs(file: WorkspaceFile, fileStats: any) { _.def("FILE", missingDoc.getRoot().root().text()) _.def("FUNCTION", missingDoc.text()) // this needs more eval-ing - _.$`Generate a function documentation for . + _.$`Generate a TypeScript function documentation for . - Make sure parameters are documented. - Be concise. Use technical tone. - do NOT include types, this is for TypeScript. @@ -257,8 +257,8 @@ rule: _.def("DOCSTRING", comment.text(), { flex: 10 }) _.def("FUNCTION", match.text(), { flex: 10 }) // this needs more eval-ing - _.$`Update the docstring to match the code in function . - - If the docstring is up to date, return /NOP/. + _.$`Update the TypeScript docstring to match the code in function . + - If the docstring is up to date, return /NOP/. It's ok to leave it as is. - do not rephrase an existing sentence if it is correct. - Make sure parameters are documented. - do NOT include types, this is for TypeScript. @@ -268,6 +268,15 @@ rule: The full source of the file is in for reference. The source of the function is in . The current docstring is . + + docstring: + + /** + * description + * @param param1 - description + * @param param2 - description + * @returns description + */ ` }, { diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts index 1d250d90aa..7689e6c3bd 100644 --- a/packages/cli/src/cli.ts +++ b/packages/cli/src/cli.ts @@ -22,6 +22,7 @@ import { parseMarkdown, parsePDF, parseSecrets, + parseTokenize, parseTokens, prompty2genaiscript, } from "./parse" // Parsing functions @@ -77,17 +78,55 @@ import { DEBUG_CATEGORIES } from "../../core/src/dbg" /** * Main function to initialize and run the CLI. * - * Sets up global error handling for uncaught exceptions. - * Verifies Node.js version compatibility. - * Configures CLI options and commands, including: - * - `configure`: Interactive help to configure providers. - * - `run`: Executes a GenAIScript against files with various options for output, retries, and caching. - * - `runs`: Commands to manage and list previous runs. - * - `test`: Group of commands for running and managing tests, including listing and viewing tests. - * - `convert`: Converts files through a GenAIScript with options for output, concurrency, and file-specific settings. - * Handles environment setup and NodeHost installation. - * Adds support for various CLI options such as working directory, environment files, color output, verbosity, and performance logging. - * Includes error handling for request errors and runtime compatibility issues. + * @param script - The script to execute. + * @param files - Optional list of files to process. + * @param cwd - Working directory for the CLI. + * @param env - Paths to environment files. + * @param noColors - Disable color output. + * @param quiet - Disable verbose output. + * @param debug - Debug categories to enable. + * @param perf - Enable performance logging. + * @param provider - Preferred LLM provider aliases. + * @param accept - Comma-separated list of accepted file extensions. + * @param excludedFiles - List of files to exclude. + * @param ignoreGitIgnore - Disable exclusion of files ignored by .gitignore. + * @param fallbackTools - Enable prompt-based tools instead of built-in LLM tool calls. + * @param out - Output folder for results. + * @param removeOut - Remove output folder if it exists. + * @param outTrace - Output file for trace. + * @param outOutput - Output file for output. + * @param outData - Output file for data, including JSON schema validation. + * @param outAnnotations - Output file for annotations. + * @param outChangelog - Output file for changelogs. + * @param pullRequest - Pull request identifier. + * @param pullRequestComment - Create a comment on a pull request with a unique ID. + * @param pullRequestDescription - Create a comment on a pull request description with a unique ID. + * @param pullRequestReviews - Create pull request reviews from annotations. + * @param teamsMessage - Post a message to the Teams channel. + * @param json - Emit full JSON response to output. + * @param yaml - Emit full YAML response to output. + * @param failOnErrors - Fail on detected annotation errors. + * @param retry - Number of retries for the run. + * @param retryDelay - Minimum delay between retries. + * @param maxDelay - Maximum delay between retries. + * @param label - Label for the run. + * @param temperature - Temperature for the run. + * @param topP - Top-p for the run. + * @param maxTokens - Maximum completion tokens for the run. + * @param maxDataRepairs - Maximum data repairs. + * @param maxToolCalls - Maximum tool calls for the run. + * @param toolChoice - Tool choice for the run. + * @param seed - Seed for the run. + * @param cache - Enable LLM result cache. + * @param cacheName - Custom cache file name. + * @param csvSeparator - CSV separator. + * @param fenceFormat - Fence format for output. + * @param applyEdits - Apply file edits. + * @param vars - Variables as name=value pairs. + * @param runRetry - Number of retries for the entire run. + * @param noRunTrace - Disable automatic trace generation. + * @param noOutputTrace - Disable automatic output generation. + * @returns Exit code indicating success or failure. */ export async function cli() { let nodeHost: NodeHost // Variable to hold NodeHost instance @@ -611,6 +650,14 @@ export async function cli() { .arguments("") .option("-ef, --excluded-files ", "excluded files") .action(parseTokens) // Action to count tokens in files + parser + .command("tokenize") + .argument("", "file to tokenize") + .description( + "Tokenizes a piece of text and display the tokens (in hex format)" + ) + .option("-m, --model ", "encoding model") + .action(parseTokenize) parser .command("jsonl2json", "Converts JSONL files to a JSON file") .argument("", "input JSONL files") diff --git a/packages/cli/src/parse.ts b/packages/cli/src/parse.ts index 6659f1d238..d6a22bb0f5 100644 --- a/packages/cli/src/parse.ts +++ b/packages/cli/src/parse.ts @@ -14,7 +14,11 @@ import { parsePdf } from "../../core/src/pdf" import { estimateTokens } from "../../core/src/tokens" import { YAMLStringify } from "../../core/src/yaml" import { resolveTokenEncoder } from "../../core/src/encoders" -import { MD_REGEX, PROMPTY_REGEX } from "../../core/src/constants" +import { + CONSOLE_TOKEN_COLORS, + MD_REGEX, + PROMPTY_REGEX, +} from "../../core/src/constants" import { promptyParse, promptyToGenAIScript } from "../../core/src/prompty" import { basename, join } from "node:path" import { CSVStringify, dataToMarkdownTable } from "../../core/src/csv" @@ -31,6 +35,10 @@ import { chunkMarkdown } from "../../core/src/mdchunk" import { normalizeInt } from "../../core/src/cleaners" import { prettyBytes } from "../../core/src/pretty" import { terminalSize } from "../../core/src/terminal" +import { consoleColors, wrapColor } from "../../core/src/consolecolor" +import { genaiscriptDebug } from "../../core/src/debug" +import { stderr, stdout } from "../../core/src/stdio" +const dbg = genaiscriptDebug("cli:parse") /** * This module provides various parsing utilities for different file types such @@ -230,7 +238,7 @@ export async function jsonl2json(files: string[]) { /** * Estimates the number of tokens in the content of files and logs the results. * @param filesGlobs - An array of files or glob patterns to process. - * @param options - Options for excluding files, specifying the model, and ignoring .gitignore. + * @param options - Options for processing files. * - excludedFiles - A list of files to exclude from processing. * - model - The name of the model used for token encoding. * - ignoreGitIgnore - Whether to ignore .gitignore rules when expanding files. @@ -261,6 +269,36 @@ export async function parseTokens( console.log(text) } +/** + * Tokenizes the content of a specified file using a provided model and logs the tokens. + * + * @param file - Path to the file to tokenize. + * @param options - Object containing the following properties: + * - model - The name of the model used for token encoding. + * + * The function reads the content of the file, tokenizes it using the given model, + * and logs each token along with its hexadecimal representation. + * Debug information about the process is also logged. + */ +export async function parseTokenize(file: string, options: { model: string }) { + const text = await readText(file) + dbg(`text: %s`, text) + const { model } = options || {} + const { + model: tokenModel, + encode: encoder, + decode: decoder, + } = await resolveTokenEncoder(model) + + console.debug(`model: %s`, tokenModel) + const tokens = encoder(text) + for (const token of tokens) { + stdout.write( + `(${wrapColor(CONSOLE_TOKEN_COLORS[0], decoder([token]))}, x${wrapColor(CONSOLE_TOKEN_COLORS[1], token.toString(16))})` + ) + } +} + /** * Converts "prompty" format files to GenAI script files. * diff --git a/packages/core/src/runpromptcontext.ts b/packages/core/src/runpromptcontext.ts index 6f384da123..e563171ffe 100644 --- a/packages/core/src/runpromptcontext.ts +++ b/packages/core/src/runpromptcontext.ts @@ -108,25 +108,6 @@ import { dotGenaiscriptPath } from "./workdir" import { prettyBytes } from "./pretty" import { createCache } from "./cache" -/** - * Creates a context for generating chat turn prompts. - * - * @param options - Contains generation options such as model, lineNumbers, and fenceFormat. - * @param trace - Trace object used to log the process and record outputs. - * - * @returns A context object used to generate prompt nodes and manage output-related functionalities. - * - * The returned context includes methods: - * - `writeText`: Appends a text node to the prompt for a specified role, priority, and max token limit. - * - `assistant`: Shortcut for writing text with an "assistant" role. - * - `$`: Creates and appends a template string node, returning a chainable interface for modifiers (e.g., priority, transforms). - * - `def`: Creates and appends a definition node for body content or external files, supporting error handling for empty definitions. - * - `defData`: Adds structured data as a definition. - * - `defDiff`: Adds a diff comparison between two data sets. - * - `fence`: Shortcut for creating definition nodes with code fences. - * - `importTemplate`: Imports a pre-defined template with associated data and appends it to the node tree. - * - `console`: Diagnostic methods (`log`, `debug`, `warn`, `error`) for capturing and printing logs or errors during the generation process. - */ export function createChatTurnGenerationContext( options: GenerationOptions, trace: MarkdownTrace, @@ -427,33 +408,6 @@ export interface RunPromptContextNode extends ChatGenerationContext { node: PromptNode } -/** - * Creates a chat generation context for handling prompts and related tasks in a conversational context. - * - * @param options - Configuration options for generation, including cancellation token, info callback, and user state. - * @param trace - Tracing utilities for logging and debugging execution. - * @param projectOptions - Project-specific parameters, including the project instance and environment variables. - * @returns A context object with various utility functions and properties for managing prompts and AI interactions. - * - * Utility Functions: - * - `defAgent(name, description, fn, options)`: Defines an agent with tools, memory, and task-solving capabilities. - * - `defTool(name, description, parameters, fn, defOptions)`: Registers a tool for use in the chat session. Supports multiple formats for tool definitions, including callbacks and MCP server configurations. - * - `defSchema(name, schema, defOptions)`: Defines a JSON schema for validation or metadata. - * - `defImages(files, defOptions)`: Processes and encodes image files for their integration into prompts. Supports tiling and slicing of images. - * - `defChatParticipant(generator, options)`: Adds chat participant logic (e.g., other agents or external systems). - * - `defFileOutput(pattern, description, options)`: Specifies output file patterns for tracking in the session. - * - `defOutputProcessor(fn)`: Adds output post-processing logic. - * - `defFileMerge(fn)`: Declares logic for merging file changes in the session output. - * - `prompt(strings, ...args)`: Runs a prompt with given input and additional options. - * - `runPrompt(generator, runOptions)`: Executes prompt logic and generates results, supporting inner execution. - * - `transcribe(audio, options)`: Transcribes audio input into text, with optional caching and language options. - * - `speak(input, options)`: Converts text to speech and saves the audio file. - * - `generateImage(prompt, imageOptions)`: Generates an image based on a textual description and custom options. - * - * Context Properties: - * - `node`: Root node capturing children elements in the prompt structure. - * - `env`: Environment variables passed to the project context for dynamic adjustments. - */ export function createChatGenerationContext( options: GenerationOptions, trace: MarkdownTrace,