diff --git a/.gitignore b/.gitignore
index 475480f..102e0a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -230,3 +230,4 @@ cython_debug/
 testing/
 devtale_demo/
 notebooks/
+devtale-testing/
diff --git a/cli.py b/cli.py
index ef6bc84..2a345db 100644
--- a/cli.py
+++ b/cli.py
@@ -16,7 +16,8 @@
     get_unit_tale,
     prepare_code_elements,
     redact_tale_information,
-    split,
+    split_code,
+    split_text,
 )
 
 DEFAULT_OUTPUT_PATH = "devtale_demo/"
@@ -34,14 +35,30 @@ def process_repository(
     fuse: bool = False,
 ) -> None:
     folders = {}
-    folder_tales = []
+    folder_tales = {
+        "repository_name": os.path.basename(os.path.abspath(root_path)),
+        "folders": [],
+    }
+
+    # get project structure before we modify it
+    gitignore_path = os.path.join(root_path, ".gitignore")
+    if os.path.exists(gitignore_path):
+        with open(gitignore_path, "r") as gitignore_file:
+            gitignore_patterns = [
+                line.strip() for line in gitignore_file if line.strip()
+            ]
+    else:
+        gitignore_patterns = None
+
+    project_tree = build_project_tree(root_path, gitignore_patterns=gitignore_patterns)
+    project_tree = ".\n" + project_tree
+
     for folder_path, _, filenames in os.walk(root_path):
         for filename in filenames:
             file_relative_path = os.path.relpath(
                 os.path.join(folder_path, filename), root_path
             )
             folder_name, file_name = os.path.split(file_relative_path)
-            # useful to keep a tree, we should use .gitignore to filter
             if folder_name not in folders:
                 folders[folder_name] = [file_name]
             else:
@@ -51,43 +68,37 @@
         folder_path = os.path.join(root_path, folder_name)
         folder_tale = process_folder(folder_path, output_path, model_name, fuse)
         if folder_tale is not None:
-            is_root_folder = False
+            # add root folder summary information
             if folder_name == root_path or folder_name == "":
-                folder_name = os.path.basename(os.path.abspath(root_path))
-                is_root_folder = True
-            folder_tales.append(
-                {
-                    "folder_name": folder_name,
-                    "folder_summary": folder_tale,
-                    "is_root_folder": is_root_folder,
-                }
-            )
+                folder_tales["folders"].append(
+                    {
+                        "folder_name": os.path.basename(os.path.abspath(root_path)),
+                        "folder_summary": folder_tale,
+                        "is_root_folder": True,
+                    }
+                )
+            else:
+                folder_tales["folders"].append(
+                    {
+                        "folder_name": os.path.basename(folder_name),
+                        "folder_summary": folder_tale,
+                    }
+                )
 
     if folder_tales:
-        root_readme = redact_tale_information("root-level", folder_tales)
-
-        # get project structure
-        gitignore_path = os.path.join(root_path, ".gitignore")
-        if os.path.exists(gitignore_path):
-            with open(gitignore_path, "r") as gitignore_file:
-                gitignore_patterns = [
-                    line.strip() for line in gitignore_file if line.strip()
-                ]
-        else:
-            gitignore_patterns = None
-
-        project_tree = build_project_tree(
-            root_path, gitignore_patterns=gitignore_patterns
-        )
-        project_tree = ".\n" + project_tree
+        folder_summaries = split_text(str(folder_tales), chunk_size=15000)
+        root_readme = redact_tale_information(
+            "root-level", folder_summaries, model_name="gpt-3.5-turbo-16k"
+        )["text"]
 
         # inject project tree
         tree = f"\n\n## Project Tree\n```bash\n{project_tree}```\n\n"
         root_readme = root_readme + tree
 
-        save_path = os.path.join(output_path, os.path.basename(root_path))
-        logger.info(f"saving root index in {save_path}")
-        with open(os.path.join(save_path, "README.md"), "w", encoding="utf-8") as file:
+        logger.info(f"saving root index in {output_path}")
+        with open(
+            os.path.join(output_path, "README.md"), "w", encoding="utf-8"
+        ) as file:
             file.write(root_readme)
 
 
@@ -119,7 +130,13 @@ def process_folder(
         )
 
     if tales:
-        folder_readme = redact_tale_information("folder-level", tales)
+        files_summaries = split_text(str(tales), chunk_size=15000)
+        folder_info = redact_tale_information(
+            "folder-level", files_summaries, model_name="gpt-3.5-turbo-16k"
+        )
+        folder_readme = folder_info["folder_readme"].replace("----------", "")
+        folder_tale = folder_info["folder_overview"]
+
         if not os.path.exists(save_path):
             os.makedirs(save_path)
 
@@ -127,7 +144,7 @@
         with open(os.path.join(save_path, "README.md"), "w", encoding="utf-8") as file:
             file.write(folder_readme)
 
-        return folder_readme
+        return folder_tale
 
     return None
 
@@ -152,8 +169,8 @@ def process_file(
         return {"file_docstring": ""}
 
     logger.info("split dev draft ideas")
-    big_docs = split(code, language=LANGUAGES[file_ext], chunk_size=10000)
-    short_docs = split(code, language=LANGUAGES[file_ext], chunk_size=3000)
+    big_docs = split_code(code, language=LANGUAGES[file_ext], chunk_size=10000)
+    short_docs = split_code(code, language=LANGUAGES[file_ext], chunk_size=3000)
 
     logger.info("extract code elements")
     code_elements = []
@@ -188,7 +205,8 @@ def process_file(
     tale = fuse_tales(tales_list, code, code_elements_dict)
 
     logger.info("add dev tale summary")
-    tale["file_docstring"] = redact_tale_information("top-level", code_elements_dict)
+    summaries = split_text(str(code_elements_dict["summary"]), chunk_size=9000)
+    tale["file_docstring"] = redact_tale_information("top-level", summaries)["text"]
 
     save_path = os.path.join(output_path, f"{file_name}.json")
     logger.info(f"save dev tale in: {save_path}")
@@ -197,7 +215,7 @@ def process_file(
 
     if fuse:
         save_path = os.path.join(output_path, file_name)
-        logger.info(f"fuse dev tale in code file {save_path}")
+        logger.info(f"save fused dev tale in: {save_path}")
 
         if file_ext == ".py":
             aggregator = PythonAggregator()
diff --git a/devtale/templates.py b/devtale/templates.py
index 7336f7b..00924ce 100644
--- a/devtale/templates.py
+++ b/devtale/templates.py
@@ -41,15 +41,14 @@
 
 
 
-The provided summaries belong to the same code file and have been \
-processed by dividing the code into sections. Utilize these summaries \
-to create a comprehensive final summary that encapsulates the purpose \
-of the file.
+The following summaries enclosed within the <<< >>> delimiters are derived from the \
+same code file. Write a top-level file docstring that combines them into a concise \
+final summary that effectively captures the overall purpose and functionality of the \
+entire code file.
 
-Summaries:
-----------
- {information}
-----------
+Summaries: <<< {information} >>>
+
+Ensure your final summary is no longer than three sentences.
 """
 
 
@@ -60,48 +59,63 @@
 Folder information: {information}
 
 Structure:
------------
+----------
 # <<>> (Always capitalize the initial letter)
 
 ## Overview
-This section provides an overview of the folder's purpose \
+(This section provides an overview of the folder's purpose \
 and objectives by understanding all the file summaries that \
-belong to the same folder.
+belong to the same folder.)
 
 ## Files
-Here is a list of files contained within this folder, accompanied \
-by concise one-line sentence description of their functionality:
+(Here is a list of files contained within this folder, accompanied \
+by a concise one-line description of their functionality)
 
-- ** <<>> **: One-line sentence description of the file
-functionality.
+- ** <<>> **: Concise one-line summary of the file's \
+operational purpose.
 
 [//]: # (Repeat the above section for each file_name in the list)
 
 For detailed insights into each file, refer to their respective \
 sections. If you have inquiries or need assistance, contact the contributors.
------------
+----------
 
 Ensure proper formatting and adhere to Markdown syntax guidelines.
+Output your answer as a JSON object with the keys: folder_overview, folder_readme
 """
 
 
-
 ROOT_LEVEL_TEMPLATE = """
-Generate the root README content using the provided readme information \
-enclosed within the <<< >>> delimiters.
+Generate a markdown text using the information enclosed \
+within the <<< >>> delimiters as your context. \
+Your output must strictly follow the provided structure below \
+without adding any other section.
 
-1- Extract the project name from the root folder name for the title.
-2- Write a summary overview based on the READMEs from all the folders.
+This is the structure your output should have:
+Structure:
+----------
+# <<>> (Please ensure that the initial letter \
+is capitalized)
+
+## Description
+(Provide a concise one-line sentence that describes the primary \
+purpose of the code, utilizing all the contextual details \
+available.)
 
-Please ensure that the generated README adheres to Markdown syntax guidelines \
-and includes the following sections:
+## Overview
+(In this section, your task is to create a single, well-structured \
+paragraph that concisely communicates the reasons behind the \
+repository's creation, its objectives, and the mechanics underlying \
+its functionality.)
+
+## Scripts
+(Enumerate the names of root CLI files. Include a one-line sentence \
+description for each file, detailing its intended purpose. If \
+there are no relevant files, omit this section entirely.)
+----------
 
--Title (based on the root folder name)
--Description (one-line sentence of what the code does based on all the \
-information).
--Overview (overview based on folder summaries)
--Scripts (List of root CLI files with one-sentence description of \
-its purpose, if any, otherwise do not display this section).
+Repository information: <<< {information} >>>
 
-Here is readme information: <<< {information} >>>
+Ensure proper formatting and adhere to Markdown syntax guidelines.
+Do not add sections that are not listed in the provided structure.
""" diff --git a/devtale/utils.py b/devtale/utils.py index 057839d..b4a40a2 100644 --- a/devtale/utils.py +++ b/devtale/utils.py @@ -25,9 +25,17 @@ } -def split(code, language, chunk_size=1000, chunk_overlap=0): +def split_text(text, chunk_size=1000, chunk_overlap=0): + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + docs = text_splitter.create_documents([text]) + return docs + + +def split_code(code, language, chunk_size=1000, chunk_overlap=0): code_splitter = RecursiveCharacterTextSplitter.from_language( - language=language, chunk_size=chunk_size, chunk_overlap=0 + language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap ) docs = code_splitter.create_documents([code]) return docs @@ -83,34 +91,35 @@ def prepare_code_elements(code_elements): return elements -def redact_tale_information(content_type, information, verbose=False): +def redact_tale_information( + content_type, docs, verbose=False, model_name="text-davinci-003" +): prompt = PromptTemplate( template=TYPE_INFORMATION[content_type], input_variables=["information"] ) - teller_of_tales = LLMChain(llm=OpenAI(), prompt=prompt, verbose=verbose) + teller_of_tales = LLMChain( + llm=OpenAI(model_name=model_name), prompt=prompt, verbose=verbose + ) + information = str(docs[0].page_content) - return teller_of_tales.run(str(information)) + text_answer = teller_of_tales({"information": information}) + if content_type == "folder-level": + json_answer = convert_to_json(text_answer) + if not json_answer: + print("Returning empty JSON due to a failure") + json_answer = {"folder_overview": "", "folder_readme": ""} + return json_answer -def get_unit_tale(short_doc, code_elements, model_name="gpt-4", verbose=False): - parser = PydanticOutputParser(pydantic_object=FileDocumentation) - prompt = PromptTemplate( - template=CODE_LEVEL_TEMPLATE, - input_variables=["code", "code_elements"], - partial_variables={"format_instructions": parser.get_format_instructions()}, - ) - teller_of_tales = LLMChain( - llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose - ) + return text_answer - result_string = teller_of_tales( - {"code": short_doc.page_content, "code_elements": code_elements} - ) + +def convert_to_json(text_answer): try: - result_json = json.loads(result_string["text"]) + result_json = json.loads(text_answer["text"]) except JSONDecodeError: try: - text = result_string["text"].replace("\\n", "\n") + text = text_answer["text"].replace("\\n", "\n") start_index = text.find("{") end_index = text.rfind("}") @@ -122,15 +131,34 @@ def get_unit_tale(short_doc, code_elements, model_name="gpt-4", verbose=False): except Exception as e: print( - f"Error getting the JSON with the docstrings. \ - Error: {e} \n Result: {result_string['text']}" + f"Error getting the JSON. 
+                Error: {e} \n Result: {text_answer['text']}"
             )
-            print("Returning empty JSON instead")
-            empty = {"classes": [], "methods": []}
-            return empty
+            return None
 
     return result_json
 
 
+def get_unit_tale(short_doc, code_elements, model_name="gpt-4", verbose=False):
+    parser = PydanticOutputParser(pydantic_object=FileDocumentation)
+    prompt = PromptTemplate(
+        template=CODE_LEVEL_TEMPLATE,
+        input_variables=["code", "code_elements"],
+        partial_variables={"format_instructions": parser.get_format_instructions()},
+    )
+    teller_of_tales = LLMChain(
+        llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose
+    )
+
+    result_string = teller_of_tales(
+        {"code": short_doc.page_content, "code_elements": code_elements}
+    )
+    json_answer = convert_to_json(result_string)
+    if not json_answer:
+        print("Returning empty JSON due to a failure")
+        json_answer = {"classes": [], "methods": []}
+    return json_answer
+
+
 def is_hallucination(code_definition, code, expected_definitions):
     # Verify that the code_definition is expected
     if code_definition not in expected_definitions:
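
For reference, here is a minimal standalone sketch of the JSON-recovery behavior that the new `convert_to_json` helper centralizes: try to parse the chain output's `"text"` field directly, and if that fails, slice from the first `{` to the last `}` and retry. The helper name and the sample payload below are illustrative assumptions, not part of the patch.

```python
import json
from json import JSONDecodeError


def convert_to_json_sketch(text_answer):
    """Sketch of the fallback parsing performed by devtale's convert_to_json."""
    try:
        # happy path: the model returned clean JSON in the "text" field
        return json.loads(text_answer["text"])
    except JSONDecodeError:
        try:
            # fallback: drop surrounding prose by slicing the outermost braces
            text = text_answer["text"].replace("\\n", "\n")
            start_index = text.find("{")
            end_index = text.rfind("}")
            return json.loads(text[start_index : end_index + 1], strict=False)
        except Exception:
            # callers substitute an empty structure when this returns None
            return None


# Illustrative LLMChain-style payload with prose wrapped around the JSON.
sample = {"text": 'Sure!\n{"folder_overview": "CLI helpers", "folder_readme": "# Tools"}'}
print(convert_to_json_sketch(sample))
# -> {'folder_overview': 'CLI helpers', 'folder_readme': '# Tools'}
```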