tools: Improvements to stripping and code collection, add a token counting bin script (#37)

* tools.stripping.stripJava and tools.collectGithubRepo improvements
* lint
* stripping: support stripping java diffs
* Add a new GPT-4 tokenizer CLI tool
* update readme
* Update README.md
extremeheat · Apr 28, 2024 · 41d49fb · 41d49fb
1 parent f4840f6
commit 41d49fb
Show file tree

Hide file tree

Showing 7 changed files with 187 additions and 27 deletions.
diff --git a/README.md b/README.md
@@ -3,7 +3,9 @@
 [![Build Status](https://github.com/extremeheat/LXL/actions/workflows/ci.yml/badge.svg)](https://github.com/extremeheat/LXL/actions/workflows/)
 [![Gitpod ready-to-code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/extremeheat/LXL)
 
-LangXLang (LXL), a Node.js library to use OpenAI's GPT models and Google's Gemini and PaLM 2 models, with function calling support.
+LangXLang (LXL) is a Node.js library and toolkit for using large language models (LLMs) inside software applications.
+
+LXL supports function calling, caching, prompt templating, role play, and building complex conversational flows with LLMs.
 
 Supported models are:
 * OpenAI: `gpt-3.5-turbo-16k`, `gpt-3.5-turbo`, `gpt-4`, `gpt-4-turbo-preview` (or any specific gpt- model listed [here](https://platform.openai.com/docs/models/))

diff --git a/bin/cli.js b/bin/cli.js
@@ -0,0 +1,39 @@
+#!/usr/bin/env node
+const gpt4 = require('gpt-tokenizer/cjs/model/gpt-4')
+
+function countTokens (text) {
+  return gpt4.encode(text).length
+}
+
+function raise (msg) {
+  if (msg) console.error(msg)
+  console.error('Usage: langxlang <command> ...args')
+  console.error('Usage: langxlang count <tokenizer> <file>')
+  console.error('Example: langxlang count gpt4 myfile.js')
+}
+
+if (process.argv.length < 3) {
+  raise()
+  process.exit(1)
+}
+
+const commands = {
+  count (tokenizer, file) {
+    if (!tokenizer || !file) {
+      raise('Must supply both a tokenizer (like gpt4) and a file')
+      process.exit(1)
+    }
+    console.log('Counting tokens in', file, 'using', tokenizer)
+    if (tokenizer === 'gpt4') {
+      const text = require('fs').readFileSync(file, 'utf8')
+      console.log('Tokens:', countTokens(text).toLocaleString())
+    } else {
+      console.error('Unknown tokenizer', tokenizer)
+      process.exit(1)
+    }
+  }
+}
+
+const [, , command, ...args] = process.argv
+console.error(`command: ${command}`, args)
+commands[command](...args)
diff --git a/package.json b/package.json
@@ -4,6 +4,9 @@
   "description": "LLM wrapper for OpenAI GPT and Google Gemini and PaLM 2 models",
   "main": "src/index.js",
   "types": "src/index.d.ts",
+  "bin": {
+    "langxlang": "bin/cli.js"
+  },
   "scripts": {
     "test": "npm run mocha",
     "pretest": "npm run lint",

diff --git a/src/index.d.ts b/src/index.d.ts
@@ -29,7 +29,7 @@ declare module 'langxlang' {
     listModels(): Promise<{ openai: Record<string, object>, google: Record<string, object> }>
 
     // Request a completion from the model with a system prompt and a single user prompt.
-    requestCompletion(model: Model, systemPrompt: string, userPrompt: string, _chunkCb?, options?: CompletionOptions & {
+    requestCompletion(model: Model, systemPrompt: string, userPrompt: string, _chunkCb?: ChunkCb, options?: CompletionOptions & {
       // If true, the response will be cached and returned from the cache if the same request is made again.
       enableCaching?: boolean
     }): Promise<CompletionResponse[]>
@@ -101,10 +101,11 @@ declare module 'langxlang' {
   interface CollectFolderOptions {
     // What extension/extension(s) of files in the repo to include
     extension?: string | string[]
-    // Either a function that returns true if the file should be included
-    // or an array of regexes of which one needs to match for inclusion
-    matching?: (fileName: string) => boolean | RegExp[]
+    // Either a function that returns true if the file should be included, or false if not (otherwise the call result is ignored).
+    // OR alternatively, pass an array of regexes of which one needs to match for inclusion.
+    matching?: ((relativePath: string, absolutePath: string, wouldBeExcluded: boolean) => boolean) | RegExp[]
     // An optional list of strings for which if the path starts with one of them, it's excluded, even if it was matched by `extension` or `matching`
+    // unless `matching` was a function and it explicitly returned `true` for the file.
     excluding?: Array<string | RegExp>
     // Try and cut down on the token size of the input by doing "stripping" to remove semantically unnecessary tokens from file
     strip?: StripOptions
@@ -122,7 +123,7 @@ declare module 'langxlang' {
     collectFolderFiles(folderPath: string, options: CollectFolderOptions): Promise<[absolutePath: string, relativePath: string, contents: string][]>
     // Returns a JS object with a list of files in a GitHub repo
     collectGithubRepoFiles(repo: string, options: CollectFolderOptions & {
-      // The branch to use
+      // The branch or ref to use
       branch?: string,
       // The URL to the repo, if it's not github.com
       url?: string,
@@ -160,7 +161,7 @@ declare module 'langxlang' {
       // Normalize line endings to \n
       normalizeLineEndings(str: string): string
       // Remove unnecessary keywords from a string
-      stripJava(input: string, options?: StripOptions): string
+      stripJava(input: string, options?: StripOptions & { removeStrings?: boolean }): string
       // Removes files from git diff matching the options.excluding regexes
       stripDiff(input: string, options?: { excluding: RegExp[] }): string
     }

diff --git a/src/tools/codebase.js b/src/tools/codebase.js
@@ -40,9 +40,14 @@ function collectFolderFiles (folder, options) {
     } else if (extension && !file.endsWith(extension)) {
       continue
     }
+    const wouldBeExcluded = excluding.some(ex => (typeof ex === 'string') ? relFile.startsWith(ex) : relFile.match(ex))
     if (options.matching) {
       if (typeof options.matching === 'function') {
-        if (!options.matching(relFile)) {
+        const matching = options.matching(relFile, file, wouldBeExcluded)
+        if (matching === false) {
+          continue
+        } else if (matching === true) {
+          relevantFiles.push([file, relFile])
           continue
         }
       } else if (Array.isArray(options.matching)) {
@@ -53,7 +58,7 @@ function collectFolderFiles (folder, options) {
         throw new Error('options.matching must be a function or an array of regexes or strings')
       }
     }
-    if (excluding.some(ex => (typeof ex === 'string') ? relFile.startsWith(ex) : relFile.match(ex))) {
+    if (wouldBeExcluded) {
       continue
     }
     relevantFiles.push([file, relFile])
@@ -89,20 +94,23 @@ function collectFolderFiles (folder, options) {
 
 // This function will clone a github repo, review all the files and merge relevant files into a single file
 function collectGithubRepoFiles (repo, options) {
-  const branch = options.branch || 'master'
+  const exec = (cmd, args) => (options.verbose ? console.log('$', cmd, args) : null, cp.execSync(cmd, args)) // eslint-disable-line no-sequences
   // First, try to clone the repo inside a "repos" folder in this directory
   const safeName = repo.replace(/\//g, ',')
   const reposDir = join(__dirname, 'repos')
   const repoPath = join(reposDir, safeName)
   fs.mkdirSync(reposDir, { recursive: true })
   if (!fs.existsSync(repoPath)) {
     const url = options.url || `https://${options.token ? options.token + '@' : ''}github.com/${repo}.git`
-    cp.execSync(`git clone ${url} ${safeName} --depth 1`, { cwd: reposDir })
+    exec(`git clone ${url} ${safeName} --depth 1`, { cwd: reposDir })
   }
+  const defaultBranch = exec('git rev-parse --abbrev-ref HEAD', { cwd: repoPath }).toString().trim()
+  const branch = options.branch || defaultBranch
+  const baseRef = branch.replace(/\^|~/g, '')
   // Git pull origin/$branch
-  cp.execSync(`git pull origin ${branch}`, { cwd: repoPath })
+  exec(`git fetch origin "${baseRef}"`, { cwd: repoPath })
   // Check out the branch
-  cp.execSync(`git checkout ${branch}`, { cwd: repoPath })
+  exec(`git checkout "${branch}"`, { cwd: repoPath })
   return collectFolderFiles(repoPath, options)
 }
 

diff --git a/src/tools/stripping.js b/src/tools/stripping.js
@@ -90,26 +90,29 @@ function stripJava (code, options) {
   const syntaxTokensToRemove = options.tokensToRemove ||
     ['protected', 'private', 'public', 'final', 'abstract', 'synchronized', 'volatile', 'transient', 'native', 'strictfp']
 
+  const ANNO_MARK = '//annotationForRemoval/ '
+
   for (const entry of tokens) {
-    if (entry[1] === 'code') {
-      for (const forRemoval of syntaxTokensToRemove) {
-        entry[0] = entry[0].replace(new RegExp('\\b' + forRemoval + ' ', 'g'), '')
-      }
-    }
     if (options.removeAnnotations) {
       if (entry[1] === 'code') {
         // console.log('Removing annotations')
         const lines = entry[0].split('\n')
         const newLines = []
         for (const line of lines) {
           if (line.trim().startsWith('@')) {
+            newLines.push(ANNO_MARK + line) // mark for later removal
             continue
           }
           newLines.push(line)
         }
         entry[0] = newLines.join('\n')
       }
     }
+    if (entry[1] === 'code') {
+      for (const forRemoval of syntaxTokensToRemove) {
+        entry[0] = entry[0].replace(new RegExp('\\b' + forRemoval + ' ', 'g'), '')
+      }
+    }
   }
   // Now we can replace some user specified tokens with other tokens. Useful for renaming variables
   if (options.replacements) {
@@ -142,14 +145,25 @@ function stripJava (code, options) {
     }
   }
   newTokens = newTokens.filter(([tokenStr, tokenType]) => tokenStr !== '')
+  if (options.removeStrings) {
+    // turn strings to empty strings
+    newTokens = newTokens.map(([tokenStr, tokenType]) => tokenType === 'string' ? ['""', tokenType] : [tokenStr, tokenType])
+  }
 
   // Now iterate through the new tokens and remove code with empty space lines
   let result = ''
   for (let i = 0; i < newTokens.length; i++) {
     const [tokenStr, tokenType] = newTokens[i]
     if (tokenType === 'code') {
       const newStrLines = []
-      for (const line of tokenStr.split('\n')) {
+      const split = tokenStr.split('\n')
+      for (let j = 0; j < split.length; j++) {
+        // skip trimming the last line, prevent issues with the next token
+        if (j === split.length - 1) {
+          newStrLines.push(split[j])
+          continue
+        }
+        const line = split[j]
         if (line.trim() === '') continue
         newStrLines.push(line)
       }
@@ -159,7 +173,20 @@ function stripJava (code, options) {
       result += tokenStr
     }
   }
-  return result
+  const lines = result.split('\n')
+  const finalLines = []
+  for (const line of lines) {
+    if (options.removeAnnotations) {
+      if (line.trim().startsWith(ANNO_MARK)) {
+        continue
+      } else if (line.includes(ANNO_MARK)) {
+        finalLines.push(line.split(ANNO_MARK)[1])
+        continue
+      }
+    }
+    finalLines.push(line)
+  }
+  return finalLines.join('\n')
 }
 
 function stripPHP (code, options = {}) {
@@ -632,24 +659,95 @@ const DEFAULT_EXCLUDE = [/node_modules/, /\.git/, /\/build\//, /\/dist\//]
 function stripDiff (diff, options = {}) {
   const exclude = options.excluding || DEFAULT_EXCLUDE
   const lines = diff.split('\n')
-  const result = []
+  const inter = []
   let inExcluded = false
-  for (const line of lines) {
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i]
+    const nextLine = lines[i + 1]
     if (line.startsWith('diff --git')) {
       inExcluded = exclude.some((ex) => ex.test(line))
+      if (options.matching) {
+        const file = line.split(' b/')[1]
+        let mode = 'modified'
+        if (nextLine.startsWith('new file')) mode = 'created'
+        else if (nextLine.startsWith('deleted file')) mode = 'deleted'
+        const matching = options.matching(file, mode, inExcluded)
+        if (matching === false) {
+          inExcluded = true
+          continue
+        }
+      }
       if (inExcluded) {
         // Treat this as a binary file
-        result.push(line)
-        result.push('index 0000000..0000000')
-        result.push('Binary files differ')
+        inter.push(line)
+        inter.push('index 0000000..0000000')
+        inter.push('Binary files differ')
       }
     }
     if (inExcluded) {
       continue
     }
-    result.push(line)
+    inter.push(line)
   }
-  return result.join('\n')
+
+  const regions = []
+  let currentFile
+  let currentFileIx
+  let currentFileContentsIx
+  for (let i = 0; i < inter.length; i++) {
+    const line = inter[i]
+    if (line.startsWith('diff --git')) {
+      if (currentFile) {
+        regions.push({ file: currentFile.trim(), start: currentFileIx, fileStart: currentFileContentsIx, end: i })
+        currentFileContentsIx = null
+      }
+      currentFile = line
+      currentFileIx = i
+    }
+    if (line.startsWith('@@')) {
+      currentFileContentsIx ||= i
+    }
+  }
+
+  regions.reverse() // we want to start from the bottom
+  const SIG_PLUS = '\t\t \t'
+  const SIG_MINUS = '\t \t\t'
+  const SUB_KEYWORD = `$STORED_${(Math.random() * 1000) | 0}_`
+  if (options.stripDiffFiles) {
+    function stripFile (region, usingMethod) {
+      const storedVariables = []
+      const slice = inter.slice(region.fileStart, region.end)
+        .map((line) => {
+          // We need to convert the git diff to normal Java so it can be stripped. But we need to keep the git data like @/+/-
+          // so we either sub+map and store or add a prefix signature (spacing is ignored so we can add a space based prefix)
+          if (line.startsWith('@@')) {
+            const forStore = line.split(' @@')
+            storedVariables.push(forStore[0] + ' @@')
+            return SUB_KEYWORD + storedVariables.length + forStore[1]
+          } else if (line.startsWith('+')) {
+            return SIG_PLUS + line.slice(1)
+          } else if (line.startsWith('-')) {
+            return SIG_MINUS + line.slice(1)
+          }
+          return line
+        })
+      const sliceStr = slice.join('\n')
+      let stripped = usingMethod(sliceStr, options)
+        .replaceAll(SIG_PLUS, '+')
+        .replaceAll(SIG_MINUS, '-')
+      for (let i = storedVariables.length - 1; i >= 0; i--) {
+        stripped = stripped.replace(SUB_KEYWORD + (i + 1), storedVariables[i])
+      }
+      const strippedLines = stripped.split('\n')
+      inter.splice(region.fileStart, region.end - region.fileStart, ...strippedLines)
+    }
+    for (const region of regions) {
+      if (!region.fileStart) continue
+      if (region.file.endsWith('.java')) stripFile(region, stripJava)
+    }
+  }
+  const result = inter.join('\n')
+  return result
 }
 
 module.exports = { stripJava, stripPHP, stripGo, stripMarkdown, stripDiff, removeNonAscii, normalizeLineEndings, tokenizeMarkdown, stripXmlComments, stripMdpComments }
diff --git a/test/tooling.test.js b/test/tooling.test.js
@@ -65,6 +65,15 @@ describe('stripping', function () {
     assert.strictEqual(block.raw.length, 867)
     assert.strictEqual(block.code.length, 858)
   })
+  it('on java', function () {
+    const s = `
+public static final EntityType<Boat> BOAT = register(
+  "boat", EntityType.Builder.<Boat>of(Boat::new, MobCategory.MISC).sized(1.375F, 0.5625F).eyeHeight(0.5625F).clientTrackingRange(10)
+)`
+    const strip = tools.stripping.stripJava(s, { removeComments: true, removeAnnotations: true, removeStrings: false })
+    // ~mostly the same but with some syntax modifiers removed
+    assert.strictEqual(s.trim().replace('public static final', 'static'), strip.trim())
+  })
 })
 
 const testObject = {