Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle token size #37

Merged
merged 8 commits into from
Aug 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -230,3 +230,4 @@ cython_debug/
testing/
devtale_demo/
notebooks/
devtale-testing/
94 changes: 56 additions & 38 deletions cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
get_unit_tale,
prepare_code_elements,
redact_tale_information,
split,
split_code,
split_text,
)

DEFAULT_OUTPUT_PATH = "devtale_demo/"
Expand All @@ -34,14 +35,30 @@ def process_repository(
fuse: bool = False,
) -> None:
folders = {}
folder_tales = []
folder_tales = {
"repository_name": os.path.basename(os.path.abspath(root_path)),
"folders": [],
}

# get project structure before we modify it
gitignore_path = os.path.join(root_path, ".gitignore")
if os.path.exists(gitignore_path):
with open(gitignore_path, "r") as gitignore_file:
gitignore_patterns = [
line.strip() for line in gitignore_file if line.strip()
]
else:
gitignore_patterns = None

project_tree = build_project_tree(root_path, gitignore_patterns=gitignore_patterns)
project_tree = ".\n" + project_tree

for folder_path, _, filenames in os.walk(root_path):
for filename in filenames:
file_relative_path = os.path.relpath(
os.path.join(folder_path, filename), root_path
)
folder_name, file_name = os.path.split(file_relative_path)
# useful to keep a tree, we should use .gitignore to filter
if folder_name not in folders:
folders[folder_name] = [file_name]
else:
Expand All @@ -51,43 +68,37 @@ def process_repository(
folder_path = os.path.join(root_path, folder_name)
folder_tale = process_folder(folder_path, output_path, model_name, fuse)
if folder_tale is not None:
is_root_folder = False
# add root folder summary information
if folder_name == root_path or folder_name == "":
folder_name = os.path.basename(os.path.abspath(root_path))
is_root_folder = True
folder_tales.append(
{
"folder_name": folder_name,
"folder_summary": folder_tale,
"is_root_folder": is_root_folder,
}
)
folder_tales["folders"].append(
{
"folder_name": os.path.basename(os.path.abspath(root_path)),
"folder_summary": folder_tale,
"is_root_folder": True,
}
)
else:
folder_tales["folders"].append(
{
"folder_name": os.path.basename(folder_name),
"folder_summary": folder_tale,
}
)

if folder_tales:
root_readme = redact_tale_information("root-level", folder_tales)

# get project structure
gitignore_path = os.path.join(root_path, ".gitignore")
if os.path.exists(gitignore_path):
with open(gitignore_path, "r") as gitignore_file:
gitignore_patterns = [
line.strip() for line in gitignore_file if line.strip()
]
else:
gitignore_patterns = None

project_tree = build_project_tree(
root_path, gitignore_patterns=gitignore_patterns
)
project_tree = ".\n" + project_tree
folder_summaries = split_text(str(folder_tales), chunk_size=15000)
root_readme = redact_tale_information(
"root-level", folder_summaries, model_name="gpt-3.5-turbo-16k"
)["text"]

# inject project tree
tree = f"\n\n## Project Tree\n```bash\n{project_tree}```\n\n"
root_readme = root_readme + tree

save_path = os.path.join(output_path, os.path.basename(root_path))
logger.info(f"saving root index in {save_path}")
with open(os.path.join(save_path, "README.md"), "w", encoding="utf-8") as file:
logger.info(f"saving root index in {output_path}")
with open(
os.path.join(output_path, "README.md"), "w", encoding="utf-8"
) as file:
file.write(root_readme)


Expand Down Expand Up @@ -119,15 +130,21 @@ def process_folder(
)

if tales:
folder_readme = redact_tale_information("folder-level", tales)
files_summaries = split_text(str(tales), chunk_size=15000)
folder_info = redact_tale_information(
"folder-level", files_summaries, model_name="gpt-3.5-turbo-16k"
)
folder_readme = folder_info["folder_readme"].replace("----------", "")
folder_tale = folder_info["folder_overview"]

if not os.path.exists(save_path):
os.makedirs(save_path)

logger.info(f"saving index in {save_path}")
with open(os.path.join(save_path, "README.md"), "w", encoding="utf-8") as file:
file.write(folder_readme)

return folder_readme
return folder_tale
return None


Expand All @@ -152,8 +169,8 @@ def process_file(
return {"file_docstring": ""}

logger.info("split dev draft ideas")
big_docs = split(code, language=LANGUAGES[file_ext], chunk_size=10000)
short_docs = split(code, language=LANGUAGES[file_ext], chunk_size=3000)
big_docs = split_code(code, language=LANGUAGES[file_ext], chunk_size=10000)
short_docs = split_code(code, language=LANGUAGES[file_ext], chunk_size=3000)

logger.info("extract code elements")
code_elements = []
Expand Down Expand Up @@ -188,7 +205,8 @@ def process_file(
tale = fuse_tales(tales_list, code, code_elements_dict)

logger.info("add dev tale summary")
tale["file_docstring"] = redact_tale_information("top-level", code_elements_dict)
summaries = split_text(str(code_elements_dict["summary"]), chunk_size=9000)
tale["file_docstring"] = redact_tale_information("top-level", summaries)["text"]

save_path = os.path.join(output_path, f"{file_name}.json")
logger.info(f"save dev tale in: {save_path}")
Expand All @@ -197,7 +215,7 @@ def process_file(

if fuse:
save_path = os.path.join(output_path, file_name)
logger.info(f"fuse dev tale in code file {save_path}")
logger.info(f"save fused dev tale in: {save_path}")

if file_ext == ".py":
aggregator = PythonAggregator()
Expand Down
74 changes: 44 additions & 30 deletions devtale/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,14 @@


FILE_LEVEL_TEMPLATE = """
The provided summaries belong to the same code file and have been \
processed by dividing the code into sections. Utilize these summaries \
to create a comprehensive final summary that encapsulates the purpose \
of the file.
The following summaries enclosed within the <<< >>> delimiters are derived from the \
same code file. Write a top-file level docstring that combines them into a concise \
final summary that effectively captures the overall purpose and functionality of the \
entire code file.

Summaries:
----------
{information}
----------
Summaries: <<< {information} >>>

Ensure your final summary is no longer than three sentences.
"""


Expand All @@ -60,48 +59,63 @@
Folder information: {information}

Structure:
-----------
----------
# <<<folder_name>>> (Always capitalize the initial letter)

## Overview
This section provides an overview of the folder's purpose \
(This section provides an overview of the folder's purpose \
and objectives by understanding all the file summaries that \
belong to the same folder.
belong to the same folder.)

## Files
Here is a list of files contained within this folder, accompanied \
by concise one-line sentence description of their functionality:
(Here is a list of files contained within this folder, accompanied \
by concise one-line sentence description of their functionality)

- ** <<<file_name>>> **: One-line sentence description of the file
functionality.
- ** <<<file_name>>> **: Concise one-line summary of the file's \
operational purpose.

[//]: # (Repeat the above section for each file_name in the list)

For detailed insights into each file, refer to their respective \
sections.
If you have inquiries or need assistance, contact the contributors.
-----------
----------

Ensure proper formatting and adhere to Markdown syntax guidelines.
Output your answer as a JSON with the keys: folder_overview, folder_readme
"""


ROOT_LEVEL_TEMPLATE = """
Generate the root README content using the provided readme information \
enclosed within the <<< >>> delimiters.
Generate a markdown text using the enclosed \
information within the <<< >>> delimiters as your context. \
Your output must strictly follow the provided structure below \
without adding any other section.

1- Extract the project name from the root folder name for the title.
2- Write a summary overview based on the READMEs from all the folders.
This is the structure your output should have:
Structure:
----------
# <<<repository_name>>> (Please ensure that the initial letter \
is capitalized)

## Description
(Provide a concise one-line sentence that describes the primary \
purpose of the code, utilizing all the contextual details \
available.)

Please ensure that the generated README adheres to Markdown syntax guidelines \
and includes the following sections:
## Overview
(In this section, your task is to create a single, well-structured \
paragraph that concisely communicates the reasons behind the \
repository's creation, its objectives, and the mechanics underlying \
its functionality.)

## Scripts
(Enumerate the names of root CLI files. Include a one-line sentence \
description for each file, detailing its intended purpose. If \
there are no relevant files, omit this section entirely.)
----------

-Title (based on the root folder name)
-Description (one-line sentence of what the code does based on all the \
information).
-Overview (overview based on folder summaries)
-Scripts (List of root CLI files with one-sentence description of \
its purpose, if any, otherwise do not display this section).
Repository information: <<< {information} >>>

Here is readme information: <<< {information} >>>
Ensure proper formatting and adhere to Markdown syntax guidelines.
Do not add sections that are not listed in the provided structure.
"""
78 changes: 53 additions & 25 deletions devtale/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,17 @@
}


def split(code, language, chunk_size=1000, chunk_overlap=0):
def split_text(text, chunk_size=1000, chunk_overlap=0):
    """Chunk plain text into Document objects of roughly chunk_size characters.

    Used to keep prompt inputs under the model's context limit before they
    are fed to an LLM chain.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.create_documents([text])


def split_code(code, language, chunk_size=1000, chunk_overlap=0):
    """Chunk source code into Document objects using language-aware splitting.

    `language` is a langchain Language enum value so splits land on sensible
    syntactic boundaries rather than arbitrary character offsets.
    """
    splitter = RecursiveCharacterTextSplitter.from_language(
        language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.create_documents([code])
Expand Down Expand Up @@ -83,34 +91,35 @@ def prepare_code_elements(code_elements):
return elements


def redact_tale_information(content_type, information, verbose=False):
def redact_tale_information(
    content_type, docs, verbose=False, model_name="text-davinci-003"
):
    """Run the prompt template for `content_type` over the first doc chunk.

    Only docs[0] is sent to the model — callers are expected to have split
    their input so the first chunk fits the model's context window.
    For "folder-level" content the model's text answer is parsed into a JSON
    dict (falling back to empty fields on failure); for every other content
    type the raw chain output dict is returned.
    """
    prompt_template = PromptTemplate(
        template=TYPE_INFORMATION[content_type], input_variables=["information"]
    )
    chain = LLMChain(
        llm=OpenAI(model_name=model_name), prompt=prompt_template, verbose=verbose
    )

    # NOTE(review): only the first chunk is used; remaining chunks are dropped.
    information = str(docs[0].page_content)
    text_answer = chain({"information": information})

    if content_type != "folder-level":
        return text_answer

    json_answer = convert_to_json(text_answer)
    if not json_answer:
        print("Returning empty JSON due to a failure")
        json_answer = {"folder_overview": "", "folder_readme": ""}
    return json_answer

result_string = teller_of_tales(
{"code": short_doc.page_content, "code_elements": code_elements}
)

def convert_to_json(text_answer):
try:
result_json = json.loads(result_string["text"])
result_json = json.loads(text_answer["text"])
except JSONDecodeError:
try:
text = result_string["text"].replace("\\n", "\n")
text = text_answer["text"].replace("\\n", "\n")
start_index = text.find("{")
end_index = text.rfind("}")

Expand All @@ -122,15 +131,34 @@ def get_unit_tale(short_doc, code_elements, model_name="gpt-4", verbose=False):

except Exception as e:
print(
f"Error getting the JSON with the docstrings. \
Error: {e} \n Result: {result_string['text']}"
f"Error getting the JSON. \
Error: {e} \n Result: {text_answer['text']}"
)
print("Returning empty JSON instead")
empty = {"classes": [], "methods": []}
return empty
return None
return result_json


def get_unit_tale(short_doc, code_elements, model_name="gpt-4", verbose=False):
    """Ask the chat model to document one code chunk and parse the reply.

    `short_doc` is a langchain Document whose page_content is the code chunk;
    `code_elements` lists the classes/methods expected in that chunk. Returns
    the parsed JSON documentation, or an empty {"classes": [], "methods": []}
    skeleton when the model's answer cannot be parsed.
    """
    output_parser = PydanticOutputParser(pydantic_object=FileDocumentation)
    prompt_template = PromptTemplate(
        template=CODE_LEVEL_TEMPLATE,
        input_variables=["code", "code_elements"],
        partial_variables={
            "format_instructions": output_parser.get_format_instructions()
        },
    )
    chain = LLMChain(
        llm=ChatOpenAI(model_name=model_name), prompt=prompt_template, verbose=verbose
    )

    raw_answer = chain(
        {"code": short_doc.page_content, "code_elements": code_elements}
    )
    parsed = convert_to_json(raw_answer)
    if parsed:
        return parsed
    print("Returning empty JSON due to a failure")
    return {"classes": [], "methods": []}


def is_hallucination(code_definition, code, expected_definitions):
# Verify that the code_definition is expected
if code_definition not in expected_definitions:
Expand Down