diff --git a/cli.py b/cli.py index 3840409..a489717 100644 --- a/cli.py +++ b/cli.py @@ -13,7 +13,12 @@ PHPAggregator, PythonAggregator, ) -from devtale.constants import ALLOWED_EXTENSIONS, DOCSTRING_LABEL, LANGUAGES +from devtale.constants import ( + ALLOWED_EXTENSIONS, + ALLOWED_NO_CODE_EXTENSIONS, + DOCSTRING_LABEL, + LANGUAGES, +) from devtale.utils import ( build_project_tree, extract_code_elements, @@ -178,9 +183,9 @@ def process_folder( for file_name in os.listdir(folder_path): file_path = os.path.join(folder_path, file_name) - if ( - os.path.isfile(file_path) - and os.path.splitext(file_name)[1] in ALLOWED_EXTENSIONS + if os.path.isfile(file_path) and ( + os.path.splitext(file_name)[1] in ALLOWED_EXTENSIONS + or os.path.splitext(file_name)[1] in ALLOWED_NO_CODE_EXTENSIONS ): logger.info(f"processing {file_path}") try: @@ -299,9 +304,14 @@ def process_file( fuse_documentation(code, found_tale, output_path, file_name, file_ext) return found_tale - if not file_ext: - unknown_file_data = {"file_name": file_name, "file_content": code} - file_docstring = redact_tale_information("unknow-top-level", unknown_file_data)[ + if not file_ext or file_ext in ALLOWED_NO_CODE_EXTENSIONS: + # a small single chunk is enough + no_code_file = split_text(code, chunk_size=5000)[0].page_content + no_code_file_data = { + "file_name": file_name, + "file_content": no_code_file, + } + file_docstring = redact_tale_information("no-code-file", no_code_file_data)[ "text" ] return {"file_docstring": file_docstring} diff --git a/devtale/constants.py b/devtale/constants.py index c769d9c..15c3b82 100644 --- a/devtale/constants.py +++ b/devtale/constants.py @@ -1,7 +1,8 @@ from langchain.text_splitter import Language # we are only documenting the file that ends with the following extensions: -ALLOWED_EXTENSIONS = [".js", ".go", ".php", ".py", ""] +ALLOWED_EXTENSIONS = [".js", ".go", ".php", ".py"] +ALLOWED_NO_CODE_EXTENSIONS = ["", ".sh", ".xml", ".yaml", ".yml"] # split code files according the programming language LANGUAGES = { diff --git a/devtale/templates.py b/devtale/templates.py index fdf5a01..9005928 100644 --- a/devtale/templates.py +++ b/devtale/templates.py @@ -39,7 +39,7 @@ Input: <<< {code} >>> """ -UNKNOWN_FILE_LEVEL_TEMPLATE = """ +NO_CODE_FILE_TEMPLATE = """ Using the following file data enclosed within the <<< >>> delimeters write a \ top-file level concise summary that effectively captures the overall purpose and \ functionality of the file. diff --git a/devtale/utils.py b/devtale/utils.py index 16bc198..afd303f 100644 --- a/devtale/utils.py +++ b/devtale/utils.py @@ -17,15 +17,15 @@ FILE_LEVEL_TEMPLATE, FOLDER_LEVEL_TEMPLATE, FOLDER_SHORT_DESCRIPTION_TEMPLATE, + NO_CODE_FILE_TEMPLATE, ROOT_LEVEL_TEMPLATE, - UNKNOWN_FILE_LEVEL_TEMPLATE, ) TYPE_INFORMATION = { "top-level": FILE_LEVEL_TEMPLATE, "folder-level": FOLDER_LEVEL_TEMPLATE, "root-level": ROOT_LEVEL_TEMPLATE, - "unknow-top-level": UNKNOWN_FILE_LEVEL_TEMPLATE, + "no-code-file": NO_CODE_FILE_TEMPLATE, "folder-description": FOLDER_SHORT_DESCRIPTION_TEMPLATE, } @@ -105,7 +105,7 @@ def redact_tale_information( teller_of_tales = LLMChain( llm=OpenAI(model_name=model_name), prompt=prompt, verbose=verbose ) - if content_type not in ["unknow-top-level", "folder-description"]: + if content_type not in ["no-code-file", "folder-description"]: information = str(docs[0].page_content) else: information = str(docs)