From 53a1c55f84a9dc3b353d17681f4a0c5074ab6a29 Mon Sep 17 00:00:00 2001
From: Nicolas van Kempen <nvankempen@meta.com>
Date: Wed, 4 Oct 2023 11:04:15 +0100
Subject: [PATCH] Factor out some functions into llm-utils

---
 pyproject.toml   |  2 +-
 src/cwhy/cwhy.py | 98 ++++--------------------------------------------
 2 files changed, 9 insertions(+), 91 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 34270ab..7312aa2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ authors = [
   { name="Nicolas van Kempen", email="nvankemp@gmail.com" },
   { name="Bryce Adelstein Lelbach", email="brycelelbach@gmail.com" }
 ]
-dependencies = ["openai>=0.27.0", "tiktoken>=0.4.0"]
+dependencies = ["openai==0.28.1", "llm_utils==0.1.4"]
 description = "Explains and proposes fixes for compile-time errors for many programming languages."
 readme = "README.md"
 requires-python = ">=3.7"
diff --git a/src/cwhy/cwhy.py b/src/cwhy/cwhy.py
index 69e8e33..c60cebb 100755
--- a/src/cwhy/cwhy.py
+++ b/src/cwhy/cwhy.py
@@ -7,91 +7,7 @@
 from typing import Dict, List, Tuple
 
 import openai
-import tiktoken
-
-
-def word_wrap_except_code_blocks(text: str) -> str:
-    """
-    Wraps text except for code blocks.
-
-    Splits the text into paragraphs and wraps each paragraph,
-    except for paragraphs that are inside of code blocks denoted
-    by ` ``` `. Returns the updated text.
-
-    Args:
-        text: The text to wrap.
-
-    Returns:
-        The wrapped text.
-    """
-    # Split text into paragraphs
-    paragraphs = text.split("\n\n")
-    wrapped_paragraphs = []
-    # Check if currently in a code block.
-    in_code_block = False
-    # Loop through each paragraph and apply appropriate wrapping.
-    for paragraph in paragraphs:
-        # If this paragraph starts and ends with a code block, add it as is.
-        if paragraph.startswith("```") and paragraph.endswith("```"):
-            wrapped_paragraphs.append(paragraph)
-            continue
-        # If this is the beginning of a code block add it as is.
-        if paragraph.startswith("```"):
-            in_code_block = True
-            wrapped_paragraphs.append(paragraph)
-            continue
-        # If this is the end of a code block stop skipping text.
-        if paragraph.endswith("```"):
-            in_code_block = False
-            wrapped_paragraphs.append(paragraph)
-            continue
-        # If we are currently in a code block add the paragraph as is.
-        if in_code_block:
-            wrapped_paragraphs.append(paragraph)
-        else:
-            # Otherwise, apply text wrapping to the paragraph.
-            wrapped_paragraph = textwrap.fill(paragraph)
-            wrapped_paragraphs.append(wrapped_paragraph)
-    # Join all paragraphs into a single string
-    wrapped_text = "\n\n".join(wrapped_paragraphs)
-    return wrapped_text
-
-
-def read_lines(file_path, start_line, end_line):
-    """
-    Read lines from a file.
-
-    Args:
-        file_path (str): The path of the file to read.
-        start_line (int): The line number of the first line to include (1-indexed). Will be bounded below by 0.
-        end_line (int): The line number of the last line to include (1-indexed). Will be bounded above by file's line count.
-
-    Returns:
-        The lines read as an array and the number of the first line included.
-
-    Raises:
-        FileNotFoundError: If the file does not exist.
-    """
-    max_chars_per_line = 128  # Prevent pathological case where lines are REALLY long.
-
-    def truncate(s, l):
-        """
-        Truncate the string to at most the given length, adding ellipses if truncated.
-        """
-        if len(s) < l:
-            return s
-        else:
-            return s[:l] + "..."
-
-    with open(file_path, "r") as f:
-        lines = f.readlines()
-        lines = [truncate(line.rstrip(), max_chars_per_line) for line in lines]
-
-    # Ensure indices are in range.
-    start_line = max(1, start_line)
-    end_line = min(len(lines), end_line)
-
-    return (lines[start_line - 1 : end_line], start_line)
+from llm_utils import llm_utils
 
 
 def complete(args, user_prompt, **kwargs):
@@ -188,7 +104,7 @@ def evaluate_text_prompt(args, prompt, wrap=True, **kwargs):
     completion = complete(args, prompt, **kwargs)
     text = completion.choices[0].message.content
     if wrap:
-        text = word_wrap_except_code_blocks(text)
+        text = llm_utils.word_wrap_except_code_blocks(text)
     return text
 
 
@@ -231,7 +147,6 @@ def evaluate_text_prompt(args, prompt, wrap=True, **kwargs):
 class explain_context:
     def __init__(self, args, diagnostic):
         self.args = args
-        self.encoding = tiktoken.encoding_for_model(args["llm"])
         self.diagnostic_lines = diagnostic.splitlines()
 
         # We group by source file.
@@ -254,7 +169,7 @@ def __init__(self, args, diagnostic):
                 continue
 
             try:
-                (abridged_code, line_start) = read_lines(
+                (abridged_code, line_start) = llm_utils.read_lines(
                     file_name, line_number - 7, line_number + 3
                 )
             except FileNotFoundError:
@@ -292,7 +207,7 @@ def build_diagnostic_string():
                 line = self.diagnostic_lines[n - i // 2 - 1]
                 list = back
             list.append(line)
-            count = len(self.encoding.encode(build_diagnostic_string()))
+            count = llm_utils.count_tokens(self.args["llm"], build_diagnostic_string())
             if count > self.args["max_error_tokens"]:
                 list.pop()
                 break
@@ -368,7 +283,10 @@ def format_file_locations(filename: str, lines: Dict[int, str]) -> str:
             for filename, lines in self.code_locations.items()
         ]
 
-        counts = [len(self.encoding.encode(x)) for x in formatted_file_locations]
+        counts = [
+            llm_utils.count_tokens(self.args["llm"], x)
+            for x in formatted_file_locations
+        ]
         index = 0
         total = 0
         while (