Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle token size #37

Merged
merged 8 commits into from
Aug 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -230,3 +230,4 @@ cython_debug/
testing/
devtale_demo/
notebooks/
devtale-testing/
94 changes: 56 additions & 38 deletions cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
get_unit_tale,
prepare_code_elements,
redact_tale_information,
split,
split_code,
split_text,
)

DEFAULT_OUTPUT_PATH = "devtale_demo/"
Expand All @@ -34,14 +35,30 @@ def process_repository(
fuse: bool = False,
) -> None:
folders = {}
folder_tales = []
folder_tales = {
"repository_name": os.path.basename(os.path.abspath(root_path)),
"folders": [],
}

# get project structure before we modify it
gitignore_path = os.path.join(root_path, ".gitignore")
if os.path.exists(gitignore_path):
with open(gitignore_path, "r") as gitignore_file:
gitignore_patterns = [
line.strip() for line in gitignore_file if line.strip()
]
else:
gitignore_patterns = None

project_tree = build_project_tree(root_path, gitignore_patterns=gitignore_patterns)
project_tree = ".\n" + project_tree

for folder_path, _, filenames in os.walk(root_path):
for filename in filenames:
file_relative_path = os.path.relpath(
os.path.join(folder_path, filename), root_path
)
folder_name, file_name = os.path.split(file_relative_path)
# useful to keep a tree, we should use .gitignore to filter
if folder_name not in folders:
folders[folder_name] = [file_name]
else:
Expand All @@ -51,43 +68,37 @@ def process_repository(
folder_path = os.path.join(root_path, folder_name)
folder_tale = process_folder(folder_path, output_path, model_name, fuse)
if folder_tale is not None:
is_root_folder = False
# add root folder summary information
if folder_name == root_path or folder_name == "":
folder_name = os.path.basename(os.path.abspath(root_path))
is_root_folder = True
folder_tales.append(
{
"folder_name": folder_name,
"folder_summary": folder_tale,
"is_root_folder": is_root_folder,
}
)
folder_tales["folders"].append(
{
"folder_name": os.path.basename(os.path.abspath(root_path)),
"folder_summary": folder_tale,
"is_root_folder": True,
}
)
else:
folder_tales["folders"].append(
{
"folder_name": os.path.basename(folder_name),
"folder_summary": folder_tale,
}
)

if folder_tales:
root_readme = redact_tale_information("root-level", folder_tales)

# get project structure
gitignore_path = os.path.join(root_path, ".gitignore")
if os.path.exists(gitignore_path):
with open(gitignore_path, "r") as gitignore_file:
gitignore_patterns = [
line.strip() for line in gitignore_file if line.strip()
]
else:
gitignore_patterns = None

project_tree = build_project_tree(
root_path, gitignore_patterns=gitignore_patterns
)
project_tree = ".\n" + project_tree
folder_summaries = split_text(str(folder_tales), chunk_size=15000)
root_readme = redact_tale_information(
"root-level", folder_summaries, model_name="gpt-3.5-turbo-16k"
)["text"]

# inject project tree
tree = f"\n\n## Project Tree\n```bash\n{project_tree}```\n\n"
root_readme = root_readme + tree

save_path = os.path.join(output_path, os.path.basename(root_path))
logger.info(f"saving root index in {save_path}")
with open(os.path.join(save_path, "README.md"), "w", encoding="utf-8") as file:
logger.info(f"saving root index in {output_path}")
with open(
os.path.join(output_path, "README.md"), "w", encoding="utf-8"
) as file:
file.write(root_readme)


Expand Down Expand Up @@ -119,15 +130,21 @@ def process_folder(
)

if tales:
folder_readme = redact_tale_information("folder-level", tales)
files_summaries = split_text(str(tales), chunk_size=15000)
folder_info = redact_tale_information(
"folder-level", files_summaries, model_name="gpt-3.5-turbo-16k"
)
folder_readme = folder_info["folder_readme"].replace("----------", "")
folder_tale = folder_info["folder_overview"]

if not os.path.exists(save_path):
os.makedirs(save_path)

logger.info(f"saving index in {save_path}")
with open(os.path.join(save_path, "README.md"), "w", encoding="utf-8") as file:
file.write(folder_readme)

return folder_readme
return folder_tale
return None


Expand All @@ -152,8 +169,8 @@ def process_file(
return {"file_docstring": ""}

logger.info("split dev draft ideas")
big_docs = split(code, language=LANGUAGES[file_ext], chunk_size=10000)
short_docs = split(code, language=LANGUAGES[file_ext], chunk_size=3000)
big_docs = split_code(code, language=LANGUAGES[file_ext], chunk_size=10000)
short_docs = split_code(code, language=LANGUAGES[file_ext], chunk_size=3000)

logger.info("extract code elements")
code_elements = []
Expand Down Expand Up @@ -188,7 +205,8 @@ def process_file(
tale = fuse_tales(tales_list, code, code_elements_dict)

logger.info("add dev tale summary")
tale["file_docstring"] = redact_tale_information("top-level", code_elements_dict)
summaries = split_text(str(code_elements_dict["summary"]), chunk_size=9000)
tale["file_docstring"] = redact_tale_information("top-level", summaries)["text"]

save_path = os.path.join(output_path, f"{file_name}.json")
logger.info(f"save dev tale in: {save_path}")
Expand All @@ -197,7 +215,7 @@ def process_file(

if fuse:
save_path = os.path.join(output_path, file_name)
logger.info(f"fuse dev tale in code file {save_path}")
logger.info(f"save fused dev tale in: {save_path}")

if file_ext == ".py":
aggregator = PythonAggregator()
Expand Down
74 changes: 44 additions & 30 deletions devtale/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,14 @@


FILE_LEVEL_TEMPLATE = """
The provided summaries belong to the same code file and have been \
processed by dividing the code into sections. Utilize these summaries \
to create a comprehensive final summary that encapsulates the purpose \
of the file.
The following summaries enclosed within the <<< >>> delimiters are derived from the \
same code file. Write a top-file level docstring that combines them into a concise \
final summary that effectively captures the overall purpose and functionality of the \
entire code file.

Summaries:
----------
{information}
----------
Summaries: <<< {information} >>>

Ensure your final summary is no longer than three sentences.
"""


Expand All @@ -60,48 +59,63 @@
Folder information: {information}

Structure:
-----------
----------
# <<<folder_name>>> (Always capitalize the initial letter)

## Overview
This section provides an overview of the folder's purpose \
(This section provides an overview of the folder's purpose \
and objectives by understanding all the file summaries that \
belong to the same folder.
belong to the same folder.)

## Files
Here is a list of files contained within this folder, accompanied \
by concise one-line sentence description of their functionality:
(Here is a list of files contained within this folder, accompanied \
by concise one-line sentence description of their functionality)

- ** <<<file_name>>> **: One-line sentence description of the file
functionality.
- ** <<<file_name>>> **: Concise one-line summary of the file's \
operational purpose.

[//]: # (Repeat the above section for each file_name in the list)

For detailed insights into each file, refer to their respective \
sections.
If you have inquiries or need assistance, contact the contributors.
-----------
----------

Ensure proper formatting and adhere to Markdown syntax guidelines.
Output your answer as a JSON with the keys: folder_overview, folder_readme
"""


ROOT_LEVEL_TEMPLATE = """
Generate the root README content using the provided readme information \
enclosed within the <<< >>> delimiters.
Generate a markdown text using the enclosed \
information within the <<< >>> delimiters as your context. \
Your output must strictly follow the provided structure below \
without adding any other section.

1- Extract the project name from the root folder name for the title.
2- Write a summary overview based on the READMEs from all the folders.
This is the structure your output should have:
Structure:
----------
# <<<repository_name>>> (Please ensure that the initial letter \
is capitalized)

## Description
(Provide a concise one-line sentence that describes the primary \
purpose of the code, utilizing all the contextual details \
available.)

Please ensure that the generated README adheres to Markdown syntax guidelines \
and includes the following sections:
## Overview
(In this section, your task is to create a single, well-structured \
paragraph that concisely communicates the reasons behind the \
repository's creation, its objectives, and the mechanics underlying \
its functionality.)

## Scripts
(Enumerate the names of root CLI files. Include a one-line sentence \
description for each file, detailing its intended purpose. If \
there are no relevant files, omit this section entirely.)
----------

-Title (based on the root folder name)
-Description (one-line sentence of what the code does based on all the \
information).
-Overview (overview based on folder summaries)
-Scripts (List of root CLI files with one-sentence description of \
its purpose, if any, otherwise do not display this section).
Repository information: <<< {information} >>>

Here is readme information: <<< {information} >>>
Ensure proper formatting and adhere to Markdown syntax guidelines.
Do not add sections that are not listed in the provided structure.
"""
78 changes: 53 additions & 25 deletions devtale/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,17 @@
}


def split(code, language, chunk_size=1000, chunk_overlap=0):
def split_text(text, chunk_size=1000, chunk_overlap=0):
    """Chunk plain text into Document objects of roughly chunk_size characters.

    Used to keep prompt inputs under the model's context limit before they
    are fed to an LLM chain.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.create_documents([text])


def split_code(code, language, chunk_size=1000, chunk_overlap=0):
    """Chunk source code into Document objects using language-aware splitting.

    `language` is a langchain Language enum value so splits land on sensible
    syntactic boundaries rather than arbitrary character offsets.
    """
    splitter = RecursiveCharacterTextSplitter.from_language(
        language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.create_documents([code])
Expand Down Expand Up @@ -83,34 +91,35 @@ def prepare_code_elements(code_elements):
return elements


def redact_tale_information(content_type, information, verbose=False):
def redact_tale_information(
    content_type, docs, verbose=False, model_name="text-davinci-003"
):
    """Run the prompt template for `content_type` over the first doc chunk.

    Only docs[0] is sent to the model — callers are expected to have split
    their input so the first chunk fits the model's context window.
    For "folder-level" content the model's text answer is parsed into a JSON
    dict (falling back to empty fields on failure); for every other content
    type the raw chain output dict is returned.
    """
    prompt_template = PromptTemplate(
        template=TYPE_INFORMATION[content_type], input_variables=["information"]
    )
    chain = LLMChain(
        llm=OpenAI(model_name=model_name), prompt=prompt_template, verbose=verbose
    )

    # NOTE(review): only the first chunk is used; remaining chunks are dropped.
    information = str(docs[0].page_content)
    text_answer = chain({"information": information})

    if content_type != "folder-level":
        return text_answer

    json_answer = convert_to_json(text_answer)
    if not json_answer:
        print("Returning empty JSON due to a failure")
        json_answer = {"folder_overview": "", "folder_readme": ""}
    return json_answer

result_string = teller_of_tales(
{"code": short_doc.page_content, "code_elements": code_elements}
)

def convert_to_json(text_answer):
try:
result_json = json.loads(result_string["text"])
result_json = json.loads(text_answer["text"])
except JSONDecodeError:
try:
text = result_string["text"].replace("\\n", "\n")
text = text_answer["text"].replace("\\n", "\n")
start_index = text.find("{")
end_index = text.rfind("}")

Expand All @@ -122,15 +131,34 @@ def get_unit_tale(short_doc, code_elements, model_name="gpt-4", verbose=False):

except Exception as e:
print(
f"Error getting the JSON with the docstrings. \
Error: {e} \n Result: {result_string['text']}"
f"Error getting the JSON. \
Error: {e} \n Result: {text_answer['text']}"
)
print("Returning empty JSON instead")
empty = {"classes": [], "methods": []}
return empty
return None
return result_json


def get_unit_tale(short_doc, code_elements, model_name="gpt-4", verbose=False):
    """Ask the chat model to document one code chunk and parse the reply.

    `short_doc` is a langchain Document whose page_content is the code chunk;
    `code_elements` lists the classes/methods expected in that chunk. Returns
    the parsed JSON documentation, or an empty {"classes": [], "methods": []}
    skeleton when the model's answer cannot be parsed.
    """
    output_parser = PydanticOutputParser(pydantic_object=FileDocumentation)
    prompt_template = PromptTemplate(
        template=CODE_LEVEL_TEMPLATE,
        input_variables=["code", "code_elements"],
        partial_variables={
            "format_instructions": output_parser.get_format_instructions()
        },
    )
    chain = LLMChain(
        llm=ChatOpenAI(model_name=model_name), prompt=prompt_template, verbose=verbose
    )

    raw_answer = chain(
        {"code": short_doc.page_content, "code_elements": code_elements}
    )
    parsed = convert_to_json(raw_answer)
    if parsed:
        return parsed
    print("Returning empty JSON due to a failure")
    return {"classes": [], "methods": []}


def is_hallucination(code_definition, code, expected_definitions):
# Verify that the code_definition is expected
if code_definition not in expected_definitions:
Expand Down