diff --git a/README.md b/README.md index 67f2bc3..dbb8fab 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,9 @@ [![Build Status](https://github.com/extremeheat/LXL/actions/workflows/ci.yml/badge.svg)](https://github.com/extremeheat/LXL/actions/workflows/) [![Gitpod ready-to-code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/extremeheat/LXL) -LangXLang (LXL), a Node.js library to use OpenAI's GPT models and Google's Gemini and PaLM 2 models, with function calling support. +LangXLang (LXL) is a Node.js library and toolkit for using large language models (LLMs) inside software applications. + +LXL supports function calling, caching, prompt templating, role play, and building complex conversational flows with LLMs. Supported models are: * OpenAI: `gpt-3.5-turbo-16k`, `gpt-3.5-turbo`, `gpt-4`, `gpt-4-turbo-preview` (or any specific gpt- model listed [here](https://platform.openai.com/docs/models/)) diff --git a/bin/cli.js b/bin/cli.js new file mode 100644 index 0000000..c52edd7 --- /dev/null +++ b/bin/cli.js @@ -0,0 +1,39 @@ +#!/usr/bin/env node +const gpt4 = require('gpt-tokenizer/cjs/model/gpt-4') + +function countTokens (text) { + return gpt4.encode(text).length +} + +function raise (msg) { + if (msg) console.error(msg) + console.error('Usage: langxlang ...args') + console.error('Usage: langxlang count ') + console.error('Example: langxlang count gpt4 myfile.js') +} + +if (process.argv.length < 3) { + raise() + process.exit(1) +} + +const commands = { + count (tokenizer, file) { + if (!tokenizer || !file) { + raise('Must supply both a tokenizer (like gpt4) and a file') + process.exit(1) + } + console.log('Counting tokens in', file, 'using', tokenizer) + if (tokenizer === 'gpt4') { + const text = require('fs').readFileSync(file, 'utf8') + console.log('Tokens:', countTokens(text).toLocaleString()) + } else { + console.error('Unknown tokenizer', tokenizer) + process.exit(1) + } + } +} + +const [, , command, 
...args] = process.argv +console.error(`command: ${command}`, args) +commands[command](...args) diff --git a/package.json b/package.json index 409084b..f0d7286 100644 --- a/package.json +++ b/package.json @@ -4,6 +4,9 @@ "description": "LLM wrapper for OpenAI GPT and Google Gemini and PaLM 2 models", "main": "src/index.js", "types": "src/index.d.ts", + "bin": { + "langxlang": "bin/cli.js" + }, "scripts": { "test": "npm run mocha", "pretest": "npm run lint", diff --git a/src/index.d.ts b/src/index.d.ts index 8ec96d0..b108374 100644 --- a/src/index.d.ts +++ b/src/index.d.ts @@ -29,7 +29,7 @@ declare module 'langxlang' { listModels(): Promise<{ openai: Record, google: Record }> // Request a completion from the model with a system prompt and a single user prompt. - requestCompletion(model: Model, systemPrompt: string, userPrompt: string, _chunkCb?, options?: CompletionOptions & { + requestCompletion(model: Model, systemPrompt: string, userPrompt: string, _chunkCb?: ChunkCb, options?: CompletionOptions & { // If true, the response will be cached and returned from the cache if the same request is made again. enableCaching?: boolean }): Promise @@ -101,10 +101,11 @@ declare module 'langxlang' { interface CollectFolderOptions { // What extension/extension(s) of files in the repo to include extension?: string | string[] - // Either a function that returns true if the file should be included - // or an array of regexes of which one needs to match for inclusion - matching?: (fileName: string) => boolean | RegExp[] + // Either a function that returns true if the file should be included, or false if not (otherwise the call result is ignored). + // OR alternatively, pass an array of regexes of which one needs to match for inclusion. 
+ matching?: ((relativePath: string, absolutePath: string, wouldBeExcluded: boolean) => boolean) | RegExp[] // An optional list of strings for which if the path starts with one of them, it's excluded, even if it was matched by `extension` or `matching` + // unless `matching` was a function and it explicitly returned `true` for the file. excluding?: Array // Try and cut down on the token size of the input by doing "stripping" to remove semantically unnecessary tokens from file strip?: StripOptions @@ -122,7 +123,7 @@ declare module 'langxlang' { collectFolderFiles(folderPath: string, options: CollectFolderOptions): Promise<[absolutePath: string, relativePath: string, contents: string][]> // Returns a JS object with a list of files in a GitHub repo collectGithubRepoFiles(repo: string, options: CollectFolderOptions & { - // The branch to use + // The branch or ref to use branch?: string, // The URL to the repo, if it's not github.com url?: string, @@ -160,7 +161,7 @@ declare module 'langxlang' { // Normalize line endings to \n normalizeLineEndings(str: string): string // Remove unnecessary keywords from a string - stripJava(input: string, options?: StripOptions): string + stripJava(input: string, options?: StripOptions & { removeStrings?: boolean }): string // Removes files from git diff matching the options.excluding regexes stripDiff(input: string, options?: { excluding: RegExp[] }): string } diff --git a/src/tools/codebase.js b/src/tools/codebase.js index ef1036c..62b7281 100644 --- a/src/tools/codebase.js +++ b/src/tools/codebase.js @@ -40,9 +40,14 @@ function collectFolderFiles (folder, options) { } else if (extension && !file.endsWith(extension)) { continue } + const wouldBeExcluded = excluding.some(ex => (typeof ex === 'string') ? 
relFile.startsWith(ex) : relFile.match(ex)) if (options.matching) { if (typeof options.matching === 'function') { - if (!options.matching(relFile)) { + const matching = options.matching(relFile, file, wouldBeExcluded) + if (matching === false) { + continue + } else if (matching === true) { + relevantFiles.push([file, relFile]) continue } } else if (Array.isArray(options.matching)) { @@ -53,7 +58,7 @@ function collectFolderFiles (folder, options) { throw new Error('options.matching must be a function or an array of regexes or strings') } } - if (excluding.some(ex => (typeof ex === 'string') ? relFile.startsWith(ex) : relFile.match(ex))) { + if (wouldBeExcluded) { continue } relevantFiles.push([file, relFile]) @@ -89,7 +94,7 @@ function collectFolderFiles (folder, options) { // This function will clone a github repo, review all the files and merge relevant files into a single file function collectGithubRepoFiles (repo, options) { - const branch = options.branch || 'master' + const exec = (cmd, args) => (options.verbose ? console.log('$', cmd, args) : null, cp.execSync(cmd, args)) // eslint-disable-line no-sequences // First, try to clone the repo inside a "repos" folder in this directory const safeName = repo.replace(/\//g, ',') const reposDir = join(__dirname, 'repos') @@ -97,12 +102,15 @@ function collectGithubRepoFiles (repo, options) { fs.mkdirSync(reposDir, { recursive: true }) if (!fs.existsSync(repoPath)) { const url = options.url || `https://${options.token ? 
options.token + '@' : ''}github.com/${repo}.git` - cp.execSync(`git clone ${url} ${safeName} --depth 1`, { cwd: reposDir }) + exec(`git clone ${url} ${safeName} --depth 1`, { cwd: reposDir }) } + const defaultBranch = exec('git rev-parse --abbrev-ref HEAD', { cwd: repoPath }).toString().trim() + const branch = options.branch || defaultBranch + const baseRef = branch.replace(/\^|~/g, '') // Git pull origin/$branch - cp.execSync(`git pull origin ${branch}`, { cwd: repoPath }) + exec(`git fetch origin "${baseRef}"`, { cwd: repoPath }) // Check out the branch - cp.execSync(`git checkout ${branch}`, { cwd: repoPath }) + exec(`git checkout "${branch}"`, { cwd: repoPath }) return collectFolderFiles(repoPath, options) } diff --git a/src/tools/stripping.js b/src/tools/stripping.js index 0c4b097..e87708f 100644 --- a/src/tools/stripping.js +++ b/src/tools/stripping.js @@ -90,12 +90,9 @@ function stripJava (code, options) { const syntaxTokensToRemove = options.tokensToRemove || ['protected', 'private', 'public', 'final', 'abstract', 'synchronized', 'volatile', 'transient', 'native', 'strictfp'] + const ANNO_MARK = '//annotationForRemoval/ ' + for (const entry of tokens) { - if (entry[1] === 'code') { - for (const forRemoval of syntaxTokensToRemove) { - entry[0] = entry[0].replace(new RegExp('\\b' + forRemoval + ' ', 'g'), '') - } - } if (options.removeAnnotations) { if (entry[1] === 'code') { // console.log('Removing annotations') @@ -103,6 +100,7 @@ function stripJava (code, options) { const newLines = [] for (const line of lines) { if (line.trim().startsWith('@')) { + newLines.push(ANNO_MARK + line) // mark for later removal continue } newLines.push(line) @@ -110,6 +108,11 @@ function stripJava (code, options) { entry[0] = newLines.join('\n') } } + if (entry[1] === 'code') { + for (const forRemoval of syntaxTokensToRemove) { + entry[0] = entry[0].replace(new RegExp('\\b' + forRemoval + ' ', 'g'), '') + } + } } // Now we can replace some user specified tokens with other 
tokens. Useful for renaming variables if (options.replacements) { @@ -142,6 +145,10 @@ function stripJava (code, options) { } } newTokens = newTokens.filter(([tokenStr, tokenType]) => tokenStr !== '') + if (options.removeStrings) { + // turn strings to empty strings + newTokens = newTokens.map(([tokenStr, tokenType]) => tokenType === 'string' ? ['""', tokenType] : [tokenStr, tokenType]) + } // Now iterate through the new tokens and remove code with empty space lines let result = '' @@ -149,7 +156,14 @@ function stripJava (code, options) { const [tokenStr, tokenType] = newTokens[i] if (tokenType === 'code') { const newStrLines = [] - for (const line of tokenStr.split('\n')) { + const split = tokenStr.split('\n') + for (let j = 0; j < split.length; j++) { + // skip trimming the last line, prevent issues with the next token + if (j === split.length - 1) { + newStrLines.push(split[j]) + continue + } + const line = split[j] if (line.trim() === '') continue newStrLines.push(line) } @@ -159,7 +173,20 @@ function stripJava (code, options) { result += tokenStr } } - return result + const lines = result.split('\n') + const finalLines = [] + for (const line of lines) { + if (options.removeAnnotations) { + if (line.trim().startsWith(ANNO_MARK)) { + continue + } else if (line.includes(ANNO_MARK)) { + finalLines.push(line.split(ANNO_MARK)[1]) + continue + } + } + finalLines.push(line) + } + return finalLines.join('\n') } function stripPHP (code, options = {}) { @@ -632,24 +659,95 @@ const DEFAULT_EXCLUDE = [/node_modules/, /\.git/, /\/build\//, /\/dist\//] function stripDiff (diff, options = {}) { const exclude = options.excluding || DEFAULT_EXCLUDE const lines = diff.split('\n') - const result = [] + const inter = [] let inExcluded = false - for (const line of lines) { + for (let i = 0; i < lines.length; i++) { + const line = lines[i] + const nextLine = lines[i + 1] if (line.startsWith('diff --git')) { inExcluded = exclude.some((ex) => ex.test(line)) + if (options.matching) { + 
const file = line.split(' b/')[1] + let mode = 'modified' + if (nextLine.startsWith('new file')) mode = 'created' + else if (nextLine.startsWith('deleted file')) mode = 'deleted' + const matching = options.matching(file, mode, inExcluded) + if (matching === false) { + inExcluded = true + continue + } + } if (inExcluded) { // Treat this as a binary file - result.push(line) - result.push('index 0000000..0000000') - result.push('Binary files differ') + inter.push(line) + inter.push('index 0000000..0000000') + inter.push('Binary files differ') } } if (inExcluded) { continue } - result.push(line) + inter.push(line) } - return result.join('\n') + + const regions = [] + let currentFile + let currentFileIx + let currentFileContentsIx + for (let i = 0; i < inter.length; i++) { + const line = inter[i] + if (line.startsWith('diff --git')) { + if (currentFile) { + regions.push({ file: currentFile.trim(), start: currentFileIx, fileStart: currentFileContentsIx, end: i }) + currentFileContentsIx = null + } + currentFile = line + currentFileIx = i + } + if (line.startsWith('@@')) { + currentFileContentsIx ||= i + } + } + + regions.reverse() // we want to start from the bottom + const SIG_PLUS = '\t\t \t' + const SIG_MINUS = '\t \t\t' + const SUB_KEYWORD = `$STORED_${(Math.random() * 1000) | 0}_` + if (options.stripDiffFiles) { + function stripFile (region, usingMethod) { + const storedVariables = [] + const slice = inter.slice(region.fileStart, region.end) + .map((line) => { + // We need to convert the git diff to normal Java so it can be stripped. 
But we need to keep the git data like @/+/- + // so we either sub+map and store or add a prefix signature (spacing is ignored so we can add a space based prefix) + if (line.startsWith('@@')) { + const forStore = line.split(' @@') + storedVariables.push(forStore[0] + ' @@') + return SUB_KEYWORD + storedVariables.length + forStore[1] + } else if (line.startsWith('+')) { + return SIG_PLUS + line.slice(1) + } else if (line.startsWith('-')) { + return SIG_MINUS + line.slice(1) + } + return line + }) + const sliceStr = slice.join('\n') + let stripped = usingMethod(sliceStr, options) + .replaceAll(SIG_PLUS, '+') + .replaceAll(SIG_MINUS, '-') + for (let i = storedVariables.length - 1; i >= 0; i--) { + stripped = stripped.replace(SUB_KEYWORD + (i + 1), storedVariables[i]) + } + const strippedLines = stripped.split('\n') + inter.splice(region.fileStart, region.end - region.fileStart, ...strippedLines) + } + for (const region of regions) { + if (!region.fileStart) continue + if (region.file.endsWith('.java')) stripFile(region, stripJava) + } + } + const result = inter.join('\n') + return result } module.exports = { stripJava, stripPHP, stripGo, stripMarkdown, stripDiff, removeNonAscii, normalizeLineEndings, tokenizeMarkdown, stripXmlComments, stripMdpComments } diff --git a/test/tooling.test.js b/test/tooling.test.js index deeea8a..163beff 100644 --- a/test/tooling.test.js +++ b/test/tooling.test.js @@ -65,6 +65,15 @@ describe('stripping', function () { assert.strictEqual(block.raw.length, 867) assert.strictEqual(block.code.length, 858) }) + it('on java', function () { + const s = ` +public static final EntityType BOAT = register( + "boat", EntityType.Builder.of(Boat::new, MobCategory.MISC).sized(1.375F, 0.5625F).eyeHeight(0.5625F).clientTrackingRange(10) +)` + const strip = tools.stripping.stripJava(s, { removeComments: true, removeAnnotations: true, removeStrings: false }) + // ~mostly the same but with some syntax modifiers removed + 
assert.strictEqual(s.trim().replace('public static final', 'static'), strip.trim()) + }) }) const testObject = {