Add additional repo metadata to llm prompts.

eli64s · Oct 23, 2023 · 73355bc · 73355bc
1 parent ba5c62f
commit 73355bc
Show file tree

Hide file tree

Showing 12 changed files with 320 additions and 225 deletions.
diff --git a/.gitignore b/.gitignore
@@ -45,9 +45,10 @@ notebooks/
 .benchmarks/
 
 # Other
+templates/
 docs/docs
 docs/notes
+docs/flow.md
 examples/markdown/readme-edgecase.md
 readmeai/settings/prompts.toml
 readmeai/markdown/data/badges.json
-templates/
diff --git a/docs/features.md b/docs/features.md
@@ -1,4 +1,4 @@
-## Key Features
+## Features
 
 <br>
 <div>

diff --git a/docs/overview.md b/docs/overview.md
@@ -1,4 +1,7 @@
 # README-AI
 
+---
 
 ## Why README-AI?
+
+---
diff --git a/examples/markdown/readme-python.md b/examples/markdown/readme-python.md
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "readmeai"
-version = "0.4.036"
+version = "0.4.037"
 description = "Generate beautiful README.md files from the terminal, powered by AI."
 authors = ["Eli <0x.eli.64s@gmail.com>"]
 license = "MIT"

diff --git a/readmeai/config/settings.py b/readmeai/config/settings.py
@@ -126,10 +126,10 @@ class PathsConfig(BaseModel):
 class PromptsConfig(BaseModel):
     """Pydantic model for OpenAI prompts."""
 
-    code_summary: str
     features: str
     overview: str
     slogan: str
+    summaries: str
 
 
 class AppConfig(BaseModel):

diff --git a/readmeai/core/model.py b/readmeai/core/model.py
@@ -57,16 +57,20 @@ def __init__(self, config: settings.AppConfig):
         self.rate_limit_semaphore = asyncio.Semaphore(self.rate_limit)
 
     async def code_to_text(
-        self, ignore: dict, files: Dict[str, str], prompt: str
+        self,
+        files: Dict[str, str],
+        ignore: Dict[str, List[str]],
+        prompt: str,
+        tree: str,
     ) -> Dict[str, str]:
         """Converts code to natural language text using large language models.
 
         Parameters
         ----------
-        ignore : dict
-            Files, directories, or file extensions to ignore.
         files : Dict[str, str]
             The repository files to convert to text.
+        ignore : Dict[str, List[str]]
+            Files, directories, or file extensions to ignore.
         prompt : str
             The prompt to use for the OpenAI API calls.
 
@@ -88,7 +92,7 @@ async def code_to_text(
                 self.logger.warning(f"Ignoring file: {path}")
                 continue
 
-            prompt_code = prompt.format(str(path), contents)
+            prompt_code = prompt.format(tree, str(path), contents)
             tasks.append(
                 asyncio.create_task(
                     self.generate_text(path, prompt_code, self.tokens)
@@ -160,7 +164,10 @@ async def generate_text(
         try:
             token_count = get_token_count(prompt, self.encoding)
 
-            if token_count > tokens:
+            if token_count > self.tokens_max:
+                self.logger.warning(
+                    f"Truncating tokens: {token_count} > {self.tokens_max}"
+                )
                 prompt = truncate_tokens(prompt, tokens)
 
             async with self.rate_limit_semaphore:

diff --git a/readmeai/core/tokens.py b/readmeai/core/tokens.py
@@ -1,4 +1,4 @@
-"""Utilities for handling tokennization."""
+"""Utilities for handling language tokens."""
 
 from tiktoken import encoding_for_model, get_encoding
 

diff --git a/readmeai/main.py b/readmeai/main.py
@@ -7,8 +7,6 @@
 import asyncio
 import traceback
 
-import requests
-
 from readmeai.config.settings import (
     AppConfig,
     AppConfigModel,
@@ -20,6 +18,7 @@
 from readmeai.core import logger, model, preprocess
 from readmeai.markdown import headers, tables, tree
 from readmeai.services import version_control as vcs
+from readmeai.utils import utils
 
 logger = logger.Logger(__name__)
 
@@ -80,23 +79,25 @@ async def readme_agent(conf: AppConfig, conf_helper: ConfigHelper) -> None:
         parser = preprocess.RepositoryParser(conf, conf_helper)
         dependencies, files = parser.get_dependencies(temp_dir)
         logger.info(f"Dependencies: {dependencies}")
-        logger.info(f"Files: {files}")
 
         # Generate codebase file summaries and README.md text via LLMs.
         if conf.cli.offline is False:
             code_summary = await llm.code_to_text(
-                conf_helper.ignore_files,
                 files,
-                conf.prompts.code_summary,
+                conf_helper.ignore_files,
+                conf.prompts.summaries,
+                tree_str,
             )
-            logger.info(f"Code summaries returned:\n{code_summary[:5]}")
             prompts = [
                 conf.prompts.slogan.format(conf.git.name),
-                conf.prompts.overview.format(repository, code_summary),
-                conf.prompts.features.format(repository, tree),
+                conf.prompts.overview.format(
+                    repository, tree_str, dependencies, code_summary
+                ),
+                conf.prompts.features.format(
+                    repository, tree_str, dependencies, code_summary
+                ),
             ]
             slogan, overview, features = await llm.chat_to_text(prompts)
-
         else:
             conf.md.tables = tables.build_recursive_tables(
                 repository, temp_dir, placeholder

diff --git a/readmeai/settings/config.toml b/readmeai/settings/config.toml
@@ -4,9 +4,9 @@ endpoint = "https://api.openai.com/v1/chat/completions"
 encoding = "cl100k_base"
 model = "gpt-3.5-turbo"
 rate_limit = 3
-tokens = 650
-tokens_max = 3800
-temperature = 0.9
+tokens = 750
+tokens_max = 4000
+temperature = 1.0
 
 # Version Control Systems
 [base_urls]
@@ -38,12 +38,7 @@ output = "readme-ai.md"
 
 # Prompts
 [prompts]
-code_summary = """Offer a comprehensive summary that encapsulates the core functionalities of the code:
-\nPath: {0}\nContents:\n{1}\n Aim for precision and conciseness in your explanation, ensuring a fine balance between detail and brevity.
-Limit your response to a maximum of 225 characters (including spaces).
-"""
-features = """Hello! Analyze the Git codebase {} and create a robust summary of the project's features.
-The following information summarizes each file in the repository to help you get started: \n{}\n
+features = """Hello! Analyze the repository {0} and follow the instructions below to generate a comprehensive list of features.
 Please provide a comprehensive technical analysis of the codebase and its components.
 Consider the codebase as a whole and highlight the key characteristics, design patterns, architectural decisions, and any other noteworthy elements.
 Generate your response as a Markdown table with the following columns:
@@ -55,21 +50,29 @@ Generate your response as a Markdown table with the following columns:
 | 🔗 | **Dependencies**   | Examine the external libraries or other systems that this system relies on here. Limit your response to a maximum of 200 characters.|
 | 🧩 | **Modularity**     | Discuss the system's organization into smaller, interchangeable components here. Limit your response to a maximum of 200 characters.|
 | 🧪 | **Testing**        | Evaluate the system's testing strategies and tools here. Limit your response to a maximum of 200 characters.       |
-| ⚡️ | **Performance**    | Analyze how well the system performs, considering speed, efficiency, and resource usage here. Limit your response to a maximum of 200 characters.|
+| ⚡️  | **Performance**    | Analyze how well the system performs, considering speed, efficiency, and resource usage here. Limit your response to a maximum of 200 characters.|
 | 🔐 | **Security**       | Assess the measures the system uses to protect data and maintain functionality here. Limit your response to a maximum of 200 characters.|
 | 🔀 | **Version Control**| Discuss the system's version control strategies and tools here. Limit your response to a maximum of 200 characters.|
 | 🔌 | **Integrations**   | Evaluate how the system interacts with other systems and services here. Limit your response to a maximum of 200 characters.|
 | 📶 | **Scalability**    | Analyze the system's ability to handle growth here. Limit your response to a maximum of 200 characters.           |
 
-Thank you for your time and effort!
+Repository Details:
+\nDirectory Tree: {1}\nDependencies: {2}\nCode Summaries: {3}\n
+"""
+overview = """Generate a <=100 word summary that describes the capabilities of the repository {0}.
+Focus on the project's use-case and value proposition, not its technical details.
+Do not refer to the project using the URL provided. Below are more details of the
+project so you can get a deep understanding of the codebase and its components.
+Repository Details:
+\nDirectory Tree: {1}\nDependencies: {2}\nCode Summaries: {3}\n
 """
-overview = """Please analyze the codebase located at {} and provide a robust, yet succinct overview of the rpoject.
-The following includes a list of the summaries of the files in the repository: \n{}\n
-Craft 3-4 sentences that encapsulate the core functionalities of the project, its purpose, and its value proposition.
+slogan = "Conceptualize a catchy and memorable slogan for the GitHub project: {0}. Limit your response to 80 characters."
+summaries = """Offer a comprehensive summary <= 80 words that encapsulates the core functionalities of the code below.
+Aim for precision and conciseness in your explanation, ensuring a fine balance between detail and brevity.
+\nDirectory Tree: {0}\nPath: {1}\nCode:\n{2}\n
 """
-slogan = "Conceptualize a catchy and memorable slogan for the GitHub project: {}. Limit your response to 80 characters."
 
-# Markdown Template Code
+# Markdown Templates
 [md]
 tables = ""
 default = "► INSERT-TEXT"

diff --git a/readmeai/settings/ignore_files.toml b/readmeai/settings/ignore_files.toml
@@ -139,4 +139,5 @@ files = [
     "__init__.py",
     "start",
     "test_binary",
+    "mkdocs.yml",
 ]
diff --git a/readmeai/utils/utils.py b/readmeai/utils/utils.py
@@ -12,17 +12,21 @@
 
 def should_ignore(conf_helper: ConfigHelper, file_path: Path) -> bool:
     """Filters out files that should be ignored."""
-    for directory in conf_helper.ignore_files["directories"]:
-        if directory in file_path.parts:
-            logger.debug(f"Ignoring directory: {file_path}")
-            return True
+    ignore_files = conf_helper.ignore_files
 
-    if file_path.name in conf_helper.ignore_files["files"]:
-        logger.debug(f"Ignoring file: {file_path}")
+    if any(
+        directory in file_path.parts
+        for directory in ignore_files["directories"]
+    ):
+        logger.debug(f"Ignoring directory: {file_path.name}")
         return True
 
-    if file_path.suffix[1:] in conf_helper.ignore_files["extensions"]:
-        logger.debug(f"Ignoring extension: {file_path}")
+    if file_path.name in ignore_files["files"]:
+        logger.debug(f"Ignoring file: {file_path.name}")
+        return True
+
+    if file_path.suffix.lstrip(".") in ignore_files["extensions"]:
+        logger.debug(f"Ignoring extension: {file_path.name}")
         return True
 
     return False