From 566b82065e58fc11c4f8f6b43668a1efdc7ea9e9 Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Mon, 25 Sep 2023 12:19:33 -0600 Subject: [PATCH 1/4] add logic to pre-estimate and estimate cost --- cli.py | 108 ++++++++++++++++++++++++++++++++----------- devtale/constants.py | 2 + devtale/utils.py | 56 ++++++++++++++++++---- 3 files changed, 129 insertions(+), 37 deletions(-) diff --git a/cli.py b/cli.py index a489717..6c32df2 100644 --- a/cli.py +++ b/cli.py @@ -28,6 +28,7 @@ redact_tale_information, split_code, split_text, + update_budget, ) DEFAULT_OUTPUT_PATH = "devtale_demo/" @@ -44,7 +45,9 @@ def process_repository( model_name: str = DEFAULT_MODEL_NAME, fuse: bool = False, debug: bool = False, + is_estimation: bool = True, ) -> None: + budget = 0 folder_tales = { "repository_name": os.path.basename(os.path.abspath(root_path)), "folders": [], @@ -90,7 +93,7 @@ def process_repository( folder_full_name = os.path.relpath(folder_path, root_path) - folder_readme, folder_tale = process_folder( + folder_readme, folder_tale, folder_budget = process_folder( folder_path=folder_path, output_path=os.path.join(output_path, folder_full_name) if folder_full_name != "." @@ -99,7 +102,9 @@ def process_repository( fuse=fuse, debug=debug, folder_full_name=folder_full_name, + is_estimation=is_estimation, ) + budget += folder_budget except Exception as e: folder_name = os.path.basename(folder_path) @@ -133,9 +138,13 @@ def process_repository( if folder_tales: folder_summaries = split_text(str(folder_tales), chunk_size=15000) - root_readme = redact_tale_information( - "root-level", folder_summaries, model_name="gpt-3.5-turbo-16k" - )["text"] + root_readme, tokens = redact_tale_information( + "root-level", + folder_summaries, + model_name="gpt-3.5-turbo-16k", + is_estimation=is_estimation, + ) + budget += update_budget(tokens, "gpt-3.5-turbo-16k") root_readme = root_readme.replace("----------", "") # inject folders information @@ -168,6 +177,8 @@ def process_repository( ) as file: file.write(root_readme) + return budget + def process_folder( folder_path: str, @@ -176,7 +187,9 @@ def process_folder( fuse: bool = False, debug: bool = False, folder_full_name: str = None, + is_estimation: bool = False, ) -> None: + budget = 0 save_path = os.path.join(output_path, os.path.basename(folder_path)) tales = [] @@ -189,7 +202,10 @@ def process_folder( ): logger.info(f"processing {file_path}") try: - file_tale = process_file(file_path, save_path, model_name, fuse, debug) + file_tale, file_budget = process_file( + file_path, save_path, model_name, fuse, debug, is_estimation + ) + budget += file_budget except Exception as e: logger.info( f"Failed to create dev tale for {file_path} - Exception: {e}" @@ -250,14 +266,22 @@ def process_folder( if tales: files_summaries = split_text(str(tales), chunk_size=10000) # split into two calls to avoid issues with json decoding markdow text. - folder_readme = redact_tale_information( - "folder-level", files_summaries, model_name="gpt-3.5-turbo-16k" - )["text"] + folder_readme, fl_tokens = redact_tale_information( + "folder-level", + files_summaries, + model_name="gpt-3.5-turbo-16k", + is_estimation=is_estimation, + ) folder_readme = folder_readme.replace("----------", "") - folder_overview = redact_tale_information( - "folder-description", folder_readme, model_name="gpt-3.5-turbo-16k" - )["text"] + folder_overview, fd_tokens = redact_tale_information( + "folder-description", + folder_readme, + model_name="gpt-3.5-turbo-16k", + is_estimation=is_estimation, + ) + + budget += update_budget(fl_tokens + fd_tokens, "gpt-3.5-turbo-16k") logger.info("save folder json..") with open(os.path.join(save_path, "folder_level.json"), "w") as json_file: @@ -267,8 +291,8 @@ def process_folder( with open(os.path.join(save_path, "README.md"), "w", encoding="utf-8") as file: file.write(folder_readme) - return folder_readme, folder_overview - return None + return folder_readme, folder_overview, budget + return None, None, budget def process_file( @@ -277,14 +301,16 @@ def process_file( model_name: str = DEFAULT_MODEL_NAME, fuse: bool = False, debug: bool = False, + is_estimation: bool = False, ) -> None: + budget = 0 file_name = os.path.basename(file_path) file_ext = os.path.splitext(file_name)[-1] save_path = os.path.join(output_path, f"{file_name}.json") if debug: logger.debug(f"FILE INFO:\nfile_path: {file_path}\nsave_path: {save_path}") - return {"file_docstring": "-"} + return {"file_docstring": "-"}, budget if not os.path.exists(output_path): os.makedirs(output_path) @@ -294,7 +320,7 @@ def process_file( code = file.read() if not code: - return {"file_docstring": ""} + return {"file_docstring": ""}, budget if os.path.exists(save_path): logger.info(f"Skipping {file_name} as its tale file already exists.") @@ -302,7 +328,7 @@ def process_file( found_tale = json.load(file) if fuse: fuse_documentation(code, found_tale, output_path, file_name, file_ext) - return found_tale + return found_tale, budget if not file_ext or file_ext in ALLOWED_NO_CODE_EXTENSIONS: # a small single chunk is enough @@ -311,10 +337,15 @@ def process_file( "file_name": file_name, "file_content": no_code_file, } - file_docstring = redact_tale_information("no-code-file", no_code_file_data)[ - "text" - ] - return {"file_docstring": file_docstring} + file_docstring, tokens = redact_tale_information( + content_type="no-code-file", + docs=no_code_file_data, + model_name="text-davinci-003", + is_estimation=is_estimation, + ) + budget += update_budget(tokens, "text-davinci-003") + + return {"file_docstring": file_docstring}, budget logger.info("split dev draft ideas") big_docs = split_code(code, language=LANGUAGES[file_ext], chunk_size=10000) @@ -323,7 +354,10 @@ def process_file( logger.info("extract code elements") code_elements = [] for idx, doc in enumerate(big_docs): - elements_set = extract_code_elements(doc) + elements_set, tokens = extract_code_elements( + big_doc=doc, model_name=model_name, is_estimation=is_estimation + ) + budget += update_budget(tokens, model_name) if elements_set: code_elements.append(elements_set) @@ -343,9 +377,15 @@ def process_file( logger.info("create tale sections") tales_list = [] # process only if we have elements to document - if code_elements_copy: + if code_elements_copy or is_estimation: for idx, doc in enumerate(short_docs): - tale = get_unit_tale(doc, code_elements_copy, model_name=model_name) + tale, tokens = get_unit_tale( + short_doc=doc, + code_elements=code_elements_copy, + model_name=model_name, + is_estimation=is_estimation, + ) + budget += update_budget(tokens, model_name) tales_list.append(tale) logger.info(f"tale section {str(idx+1)}/{len(short_docs)} done.") @@ -361,7 +401,13 @@ def process_file( logger.info("add dev tale summary") summaries = split_text(str(code_elements_dict["summary"]), chunk_size=9000) - file_docstring = redact_tale_information("top-level", summaries)["text"] + file_docstring, tokens = redact_tale_information( + content_type="top-level", + docs=summaries, + model_name="text-davinci-003", + is_estimation=is_estimation, + ) + budget += update_budget(tokens, "text-davinci-003") if fuse: # add docstring label only to insert it along the docstring into the code @@ -374,7 +420,7 @@ def process_file( with open(save_path, "w") as json_file: json.dump(tale, json_file, indent=2) - return tale + return tale, budget def fuse_documentation(code, tale, output_path, file_name, file_ext): @@ -461,34 +507,40 @@ def main( if os.path.isdir(path): if recursive: logger.info("Processing repository") - process_repository( + price = process_repository( root_path=path, output_path=output_path, model_name=model_name, fuse=fuse, debug=debug, + is_estimation=True, ) else: logger.info("Processing folder") - process_folder( + _, price = process_folder( folder_path=path, output_path=output_path, model_name=model_name, fuse=fuse, debug=debug, + is_estimation=False, ) elif os.path.isfile(path): logger.info("Processing file") - process_file( + _, price = process_file( file_path=path, output_path=output_path, model_name=model_name, fuse=fuse, debug=debug, + is_estimation=False, ) + else: raise f"Invalid input path {path}. Path must be a directory or code file." + logger.info(f"Rough cost = {price}") + if __name__ == "__main__": main() diff --git a/devtale/constants.py b/devtale/constants.py index 15c3b82..855f858 100644 --- a/devtale/constants.py +++ b/devtale/constants.py @@ -13,3 +13,5 @@ } DOCSTRING_LABEL = "@DEVTALE-GENERATED:" + +GPT_PRICE = {"gpt-4": 0.03, "gpt-3.5-turbo-16k": 0.03, "text-davinci-003": 0.0015} diff --git a/devtale/utils.py b/devtale/utils.py index afd303f..a784cd8 100644 --- a/devtale/utils.py +++ b/devtale/utils.py @@ -4,12 +4,13 @@ from json import JSONDecodeError from pathlib import Path +import tiktoken from langchain import LLMChain, OpenAI, PromptTemplate from langchain.chat_models import ChatOpenAI from langchain.output_parsers import PydanticOutputParser from langchain.text_splitter import RecursiveCharacterTextSplitter -from devtale.constants import DOCSTRING_LABEL +from devtale.constants import DOCSTRING_LABEL, GPT_PRICE from devtale.schema import FileDocumentation from devtale.templates import ( CODE_EXTRACTOR_TEMPLATE, @@ -30,6 +31,20 @@ } +def calc_tokens(input: str, model: str) -> int: + if model == "davinci": + encoding = "p50k_base" + else: + encoding = "cl100k_base" + + tokens = tiktoken.get_encoding(encoding).encode(input) + return len(tokens) + + +def update_budget(n_tokens, model: str): + return (n_tokens / 1000) * GPT_PRICE[model] + + def split_text(text, chunk_size=1000, chunk_overlap=0): text_splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap @@ -46,17 +61,23 @@ def split_code(code, language, chunk_size=1000, chunk_overlap=0): return docs -def extract_code_elements(big_doc, verbose=False): +def extract_code_elements( + big_doc, verbose=False, model_name="gpt-4", is_estimation=False +): prompt = PromptTemplate( template=CODE_EXTRACTOR_TEMPLATE, input_variables=["code"], ) extractor = LLMChain( - llm=ChatOpenAI(model_name="gpt-4"), prompt=prompt, verbose=verbose + llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose ) + tokens = calc_tokens(prompt.format(code=big_doc.page_content), model_name) + if is_estimation: + return "", tokens + result_string = extractor({"code": big_doc.page_content}) - return result_string["text"] + return result_string["text"], tokens def _process_extracted_code_element(text: str): @@ -97,7 +118,11 @@ def prepare_code_elements(code_elements): def redact_tale_information( - content_type, docs, verbose=False, model_name="text-davinci-003" + content_type, + docs, + verbose=False, + model_name="text-davinci-003", + is_estimation=False, ): prompt = PromptTemplate( template=TYPE_INFORMATION[content_type], input_variables=["information"] @@ -110,9 +135,13 @@ def redact_tale_information( else: information = str(docs) - text_answer = teller_of_tales({"information": information}) + tokens = calc_tokens(prompt.format(information=information), model_name) - return text_answer + if is_estimation: + return "", tokens + + text_answer = teller_of_tales({"information": information}) + return text_answer["text"], tokens def convert_to_json(text_answer): @@ -140,7 +169,9 @@ def convert_to_json(text_answer): return None -def get_unit_tale(short_doc, code_elements, model_name="gpt-4", verbose=False): +def get_unit_tale( + short_doc, code_elements, model_name="gpt-4", verbose=False, is_estimation=False +): parser = PydanticOutputParser(pydantic_object=FileDocumentation) prompt = PromptTemplate( template=CODE_LEVEL_TEMPLATE, @@ -151,6 +182,13 @@ def get_unit_tale(short_doc, code_elements, model_name="gpt-4", verbose=False): llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose ) + tokens = calc_tokens( + prompt.format(code=short_doc.page_content, code_elements=str(code_elements)), + model_name, + ) + if is_estimation: + return {"classes": [], "methods": []}, tokens + result_string = teller_of_tales( {"code": short_doc.page_content, "code_elements": code_elements} ) @@ -158,7 +196,7 @@ def get_unit_tale(short_doc, code_elements, model_name="gpt-4", verbose=False): if not json_answer: print("Returning empty JSON due to a failure") json_answer = {"classes": [], "methods": []} - return json_answer + return json_answer, tokens def is_hallucination(code_definition, code, expected_definitions): From d320d6f2dfef724acf44b084c0543ad10443ead7 Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Mon, 25 Sep 2023 12:56:17 -0600 Subject: [PATCH 2/4] avoid saving tales if it is an estimation --- cli.py | 62 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/cli.py b/cli.py index 6c32df2..bd475dd 100644 --- a/cli.py +++ b/cli.py @@ -167,15 +167,16 @@ def process_repository( root_readme = root_readme + modified_original_readme - logger.info("save root json..") - with open(os.path.join(output_path, "root_level.json"), "w") as json_file: - json.dump(folder_tales, json_file, indent=2) + if not is_estimation: + logger.info("save root json..") + with open(os.path.join(output_path, "root_level.json"), "w") as json_file: + json.dump(folder_tales, json_file, indent=2) - logger.info(f"saving root index in {output_path}") - with open( - os.path.join(output_path, "README.md"), "w", encoding="utf-8" - ) as file: - file.write(root_readme) + logger.info(f"saving root index in {output_path}") + with open( + os.path.join(output_path, "README.md"), "w", encoding="utf-8" + ) as file: + file.write(root_readme) return budget @@ -261,7 +262,7 @@ def process_folder( """ ) logger.debug(f"FILE_TALES: {tales}") - return "-", "-" + return "-", "-", budget if tales: files_summaries = split_text(str(tales), chunk_size=10000) @@ -283,13 +284,16 @@ def process_folder( budget += update_budget(fl_tokens + fd_tokens, "gpt-3.5-turbo-16k") - logger.info("save folder json..") - with open(os.path.join(save_path, "folder_level.json"), "w") as json_file: - json.dump(tales, json_file, indent=2) + if not is_estimation: + logger.info("save folder json..") + with open(os.path.join(save_path, "folder_level.json"), "w") as json_file: + json.dump(tales, json_file, indent=2) - logger.info(f"saving index in {save_path}") - with open(os.path.join(save_path, "README.md"), "w", encoding="utf-8") as file: - file.write(folder_readme) + logger.info(f"saving index in {save_path}") + with open( + os.path.join(save_path, "README.md"), "w", encoding="utf-8" + ) as file: + file.write(folder_readme) return folder_readme, folder_overview, budget return None, None, budget @@ -409,7 +413,7 @@ def process_file( ) budget += update_budget(tokens, "text-davinci-003") - if fuse: + if fuse and not is_estimation: # add docstring label only to insert it along the docstring into the code tale["file_docstring"] = DOCSTRING_LABEL + "\n" + file_docstring fuse_documentation(code, tale, output_path, file_name, file_ext) @@ -417,8 +421,10 @@ def process_file( tale["file_docstring"] = file_docstring logger.info(f"save dev tale in: {save_path}") - with open(save_path, "w") as json_file: - json.dump(tale, json_file, indent=2) + + if not is_estimation: + with open(save_path, "w") as json_file: + json.dump(tale, json_file, indent=2) return tale, budget @@ -489,6 +495,14 @@ def fuse_documentation(code, tale, output_path, file_name, file_ext): default=False, help="Mock answer and avoid GPT calls", ) +@click.option( + "--estimation", + "is_estimation", + is_flag=True, + default=False, + help="True to calculate an approximate cost of documenting your code without \ + doing any GPT call", +) def main( path: str, recursive: bool, @@ -496,6 +510,7 @@ def main( output_path: str = DEFAULT_OUTPUT_PATH, model_name: str = DEFAULT_MODEL_NAME, debug: bool = False, + is_estimation: bool = False, ): load_dotenv() @@ -513,7 +528,7 @@ def main( model_name=model_name, fuse=fuse, debug=debug, - is_estimation=True, + is_estimation=is_estimation, ) else: logger.info("Processing folder") @@ -523,7 +538,7 @@ def main( model_name=model_name, fuse=fuse, debug=debug, - is_estimation=False, + is_estimation=is_estimation, ) elif os.path.isfile(path): logger.info("Processing file") @@ -533,13 +548,16 @@ def main( model_name=model_name, fuse=fuse, debug=debug, - is_estimation=False, + is_estimation=is_estimation, ) else: raise f"Invalid input path {path}. Path must be a directory or code file." - logger.info(f"Rough cost = {price}") + if is_estimation: + logger.info(f"Approximate cost: {price}") + else: + logger.info(f"Cost: {price}") if __name__ == "__main__": From 7b514f371de6718fcb051ff4d6925300f8495f82 Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Mon, 25 Sep 2023 13:36:34 -0600 Subject: [PATCH 3/4] use total cost if not estimation --- cli.py | 61 ++++++++++++++++++++++++------------------------ devtale/utils.py | 59 +++++++++++++++++++++++++++------------------- 2 files changed, 65 insertions(+), 55 deletions(-) diff --git a/cli.py b/cli.py index bd475dd..13e4110 100644 --- a/cli.py +++ b/cli.py @@ -28,7 +28,6 @@ redact_tale_information, split_code, split_text, - update_budget, ) DEFAULT_OUTPUT_PATH = "devtale_demo/" @@ -47,7 +46,7 @@ def process_repository( debug: bool = False, is_estimation: bool = True, ) -> None: - budget = 0 + cost = 0 folder_tales = { "repository_name": os.path.basename(os.path.abspath(root_path)), "folders": [], @@ -93,7 +92,7 @@ def process_repository( folder_full_name = os.path.relpath(folder_path, root_path) - folder_readme, folder_tale, folder_budget = process_folder( + folder_readme, folder_tale, folder_cost = process_folder( folder_path=folder_path, output_path=os.path.join(output_path, folder_full_name) if folder_full_name != "." @@ -104,7 +103,7 @@ def process_repository( folder_full_name=folder_full_name, is_estimation=is_estimation, ) - budget += folder_budget + cost += folder_cost except Exception as e: folder_name = os.path.basename(folder_path) @@ -138,13 +137,13 @@ def process_repository( if folder_tales: folder_summaries = split_text(str(folder_tales), chunk_size=15000) - root_readme, tokens = redact_tale_information( + root_readme, call_cost = redact_tale_information( "root-level", folder_summaries, model_name="gpt-3.5-turbo-16k", is_estimation=is_estimation, ) - budget += update_budget(tokens, "gpt-3.5-turbo-16k") + cost += call_cost root_readme = root_readme.replace("----------", "") # inject folders information @@ -178,7 +177,7 @@ def process_repository( ) as file: file.write(root_readme) - return budget + return cost def process_folder( @@ -190,7 +189,7 @@ def process_folder( folder_full_name: str = None, is_estimation: bool = False, ) -> None: - budget = 0 + cost = 0 save_path = os.path.join(output_path, os.path.basename(folder_path)) tales = [] @@ -203,10 +202,10 @@ def process_folder( ): logger.info(f"processing {file_path}") try: - file_tale, file_budget = process_file( + file_tale, file_cost = process_file( file_path, save_path, model_name, fuse, debug, is_estimation ) - budget += file_budget + cost += file_cost except Exception as e: logger.info( f"Failed to create dev tale for {file_path} - Exception: {e}" @@ -262,12 +261,12 @@ def process_folder( """ ) logger.debug(f"FILE_TALES: {tales}") - return "-", "-", budget + return "-", "-", cost if tales: files_summaries = split_text(str(tales), chunk_size=10000) # split into two calls to avoid issues with json decoding markdow text. - folder_readme, fl_tokens = redact_tale_information( + folder_readme, fl_cost = redact_tale_information( "folder-level", files_summaries, model_name="gpt-3.5-turbo-16k", @@ -275,14 +274,14 @@ def process_folder( ) folder_readme = folder_readme.replace("----------", "") - folder_overview, fd_tokens = redact_tale_information( + folder_overview, fd_cost = redact_tale_information( "folder-description", folder_readme, model_name="gpt-3.5-turbo-16k", is_estimation=is_estimation, ) - budget += update_budget(fl_tokens + fd_tokens, "gpt-3.5-turbo-16k") + cost += fl_cost + fd_cost if not is_estimation: logger.info("save folder json..") @@ -295,8 +294,8 @@ def process_folder( ) as file: file.write(folder_readme) - return folder_readme, folder_overview, budget - return None, None, budget + return folder_readme, folder_overview, cost + return None, None, cost def process_file( @@ -307,14 +306,14 @@ def process_file( debug: bool = False, is_estimation: bool = False, ) -> None: - budget = 0 + cost = 0 file_name = os.path.basename(file_path) file_ext = os.path.splitext(file_name)[-1] save_path = os.path.join(output_path, f"{file_name}.json") if debug: logger.debug(f"FILE INFO:\nfile_path: {file_path}\nsave_path: {save_path}") - return {"file_docstring": "-"}, budget + return {"file_docstring": "-"}, cost if not os.path.exists(output_path): os.makedirs(output_path) @@ -324,7 +323,7 @@ def process_file( code = file.read() if not code: - return {"file_docstring": ""}, budget + return {"file_docstring": ""}, cost if os.path.exists(save_path): logger.info(f"Skipping {file_name} as its tale file already exists.") @@ -332,7 +331,7 @@ def process_file( found_tale = json.load(file) if fuse: fuse_documentation(code, found_tale, output_path, file_name, file_ext) - return found_tale, budget + return found_tale, cost if not file_ext or file_ext in ALLOWED_NO_CODE_EXTENSIONS: # a small single chunk is enough @@ -341,15 +340,15 @@ def process_file( "file_name": file_name, "file_content": no_code_file, } - file_docstring, tokens = redact_tale_information( + file_docstring, call_cost = redact_tale_information( content_type="no-code-file", docs=no_code_file_data, model_name="text-davinci-003", is_estimation=is_estimation, ) - budget += update_budget(tokens, "text-davinci-003") + cost += call_cost - return {"file_docstring": file_docstring}, budget + return {"file_docstring": file_docstring}, cost logger.info("split dev draft ideas") big_docs = split_code(code, language=LANGUAGES[file_ext], chunk_size=10000) @@ -358,10 +357,10 @@ def process_file( logger.info("extract code elements") code_elements = [] for idx, doc in enumerate(big_docs): - elements_set, tokens = extract_code_elements( + elements_set, call_cost = extract_code_elements( big_doc=doc, model_name=model_name, is_estimation=is_estimation ) - budget += update_budget(tokens, model_name) + cost += call_cost if elements_set: code_elements.append(elements_set) @@ -383,13 +382,13 @@ def process_file( # process only if we have elements to document if code_elements_copy or is_estimation: for idx, doc in enumerate(short_docs): - tale, tokens = get_unit_tale( + tale, call_cost = get_unit_tale( short_doc=doc, code_elements=code_elements_copy, model_name=model_name, is_estimation=is_estimation, ) - budget += update_budget(tokens, model_name) + cost += call_cost tales_list.append(tale) logger.info(f"tale section {str(idx+1)}/{len(short_docs)} done.") @@ -405,13 +404,13 @@ def process_file( logger.info("add dev tale summary") summaries = split_text(str(code_elements_dict["summary"]), chunk_size=9000) - file_docstring, tokens = redact_tale_information( + file_docstring, call_cost = redact_tale_information( content_type="top-level", docs=summaries, model_name="text-davinci-003", is_estimation=is_estimation, ) - budget += update_budget(tokens, "text-davinci-003") + cost += call_cost if fuse and not is_estimation: # add docstring label only to insert it along the docstring into the code @@ -426,7 +425,7 @@ def process_file( with open(save_path, "w") as json_file: json.dump(tale, json_file, indent=2) - return tale, budget + return tale, cost def fuse_documentation(code, tale, output_path, file_name, file_ext): @@ -557,7 +556,7 @@ def main( if is_estimation: logger.info(f"Approximate cost: {price}") else: - logger.info(f"Cost: {price}") + logger.info(f"Total cost: {price}") if __name__ == "__main__": diff --git a/devtale/utils.py b/devtale/utils.py index a784cd8..ff9f9be 100644 --- a/devtale/utils.py +++ b/devtale/utils.py @@ -6,6 +6,7 @@ import tiktoken from langchain import LLMChain, OpenAI, PromptTemplate +from langchain.callbacks import get_openai_callback from langchain.chat_models import ChatOpenAI from langchain.output_parsers import PydanticOutputParser from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -31,18 +32,14 @@ } -def calc_tokens(input: str, model: str) -> int: +def calculate_cost(input: str, model: str): if model == "davinci": encoding = "p50k_base" else: encoding = "cl100k_base" tokens = tiktoken.get_encoding(encoding).encode(input) - return len(tokens) - - -def update_budget(n_tokens, model: str): - return (n_tokens / 1000) * GPT_PRICE[model] + return (len(tokens) / 1000) * GPT_PRICE[model] def split_text(text, chunk_size=1000, chunk_overlap=0): @@ -72,12 +69,17 @@ def extract_code_elements( llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose ) - tokens = calc_tokens(prompt.format(code=big_doc.page_content), model_name) if is_estimation: - return "", tokens + estimated_cost = calculate_cost( + prompt.format(code=big_doc.page_content), model_name + ) + return "", estimated_cost - result_string = extractor({"code": big_doc.page_content}) - return result_string["text"], tokens + with get_openai_callback() as cb: + result_string = extractor({"code": big_doc.page_content}) + cost = cb.total_cost + + return result_string["text"], cost def _process_extracted_code_element(text: str): @@ -135,13 +137,17 @@ def redact_tale_information( else: information = str(docs) - tokens = calc_tokens(prompt.format(information=information), model_name) - if is_estimation: - return "", tokens + estimated_cost = calculate_cost( + prompt.format(information=information), model_name + ) + return "", estimated_cost - text_answer = teller_of_tales({"information": information}) - return text_answer["text"], tokens + with get_openai_callback() as cb: + text_answer = teller_of_tales({"information": information}) + cost = cb.total_cost + + return text_answer["text"], cost def convert_to_json(text_answer): @@ -182,21 +188,26 @@ def get_unit_tale( llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose ) - tokens = calc_tokens( - prompt.format(code=short_doc.page_content, code_elements=str(code_elements)), - model_name, - ) if is_estimation: - return {"classes": [], "methods": []}, tokens + estimated_cost = calculate_cost( + prompt.format( + code=short_doc.page_content, code_elements=str(code_elements) + ), + model_name, + ) + return {"classes": [], "methods": []}, estimated_cost + + with get_openai_callback() as cb: + result_string = teller_of_tales( + {"code": short_doc.page_content, "code_elements": code_elements} + ) + cost = cb.total_cost - result_string = teller_of_tales( - {"code": short_doc.page_content, "code_elements": code_elements} - ) json_answer = convert_to_json(result_string) if not json_answer: print("Returning empty JSON due to a failure") json_answer = {"classes": [], "methods": []} - return json_answer, tokens + return json_answer, cost def is_hallucination(code_definition, code, expected_definitions): From 2cd6d10ad7671c5e7814fa8f8150703f3fb775fa Mon Sep 17 00:00:00 2001 From: Alberto Gaona Date: Tue, 26 Sep 2023 07:05:53 -0600 Subject: [PATCH 4/4] fix review comments --- cli.py | 49 ++++++++++++++++++++++---------------------- devtale/constants.py | 1 + devtale/utils.py | 14 ++++++------- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/cli.py b/cli.py index 13e4110..1e77187 100644 --- a/cli.py +++ b/cli.py @@ -44,7 +44,7 @@ def process_repository( model_name: str = DEFAULT_MODEL_NAME, fuse: bool = False, debug: bool = False, - is_estimation: bool = True, + cost_estimation: bool = True, ) -> None: cost = 0 folder_tales = { @@ -101,7 +101,7 @@ def process_repository( fuse=fuse, debug=debug, folder_full_name=folder_full_name, - is_estimation=is_estimation, + cost_estimation=cost_estimation, ) cost += folder_cost @@ -141,7 +141,7 @@ def process_repository( "root-level", folder_summaries, model_name="gpt-3.5-turbo-16k", - is_estimation=is_estimation, + cost_estimation=cost_estimation, ) cost += call_cost root_readme = root_readme.replace("----------", "") @@ -166,7 +166,7 @@ def process_repository( root_readme = root_readme + modified_original_readme - if not is_estimation: + if not cost_estimation: logger.info("save root json..") with open(os.path.join(output_path, "root_level.json"), "w") as json_file: json.dump(folder_tales, json_file, indent=2) @@ -187,7 +187,7 @@ def process_folder( fuse: bool = False, debug: bool = False, folder_full_name: str = None, - is_estimation: bool = False, + cost_estimation: bool = False, ) -> None: cost = 0 save_path = os.path.join(output_path, os.path.basename(folder_path)) @@ -203,7 +203,7 @@ def process_folder( logger.info(f"processing {file_path}") try: file_tale, file_cost = process_file( - file_path, save_path, model_name, fuse, debug, is_estimation + file_path, save_path, model_name, fuse, debug, cost_estimation ) cost += file_cost except Exception as e: @@ -270,7 +270,7 @@ def process_folder( "folder-level", files_summaries, model_name="gpt-3.5-turbo-16k", - is_estimation=is_estimation, + cost_estimation=cost_estimation, ) folder_readme = folder_readme.replace("----------", "") @@ -278,12 +278,12 @@ def process_folder( "folder-description", folder_readme, model_name="gpt-3.5-turbo-16k", - is_estimation=is_estimation, + cost_estimation=cost_estimation, ) cost += fl_cost + fd_cost - if not is_estimation: + if not cost_estimation: logger.info("save folder json..") with open(os.path.join(save_path, "folder_level.json"), "w") as json_file: json.dump(tales, json_file, indent=2) @@ -304,7 +304,7 @@ def process_file( model_name: str = DEFAULT_MODEL_NAME, fuse: bool = False, debug: bool = False, - is_estimation: bool = False, + cost_estimation: bool = False, ) -> None: cost = 0 file_name = os.path.basename(file_path) @@ -344,7 +344,7 @@ def process_file( content_type="no-code-file", docs=no_code_file_data, model_name="text-davinci-003", - is_estimation=is_estimation, + cost_estimation=cost_estimation, ) cost += call_cost @@ -358,7 +358,7 @@ def process_file( code_elements = [] for idx, doc in enumerate(big_docs): elements_set, call_cost = extract_code_elements( - big_doc=doc, model_name=model_name, is_estimation=is_estimation + big_doc=doc, model_name=model_name, cost_estimation=cost_estimation ) cost += call_cost if elements_set: @@ -380,13 +380,13 @@ def process_file( logger.info("create tale sections") tales_list = [] # process only if we have elements to document - if code_elements_copy or is_estimation: + if code_elements_copy or cost_estimation: for idx, doc in enumerate(short_docs): tale, call_cost = get_unit_tale( short_doc=doc, code_elements=code_elements_copy, model_name=model_name, - is_estimation=is_estimation, + cost_estimation=cost_estimation, ) cost += call_cost tales_list.append(tale) @@ -408,11 +408,11 @@ def process_file( content_type="top-level", docs=summaries, model_name="text-davinci-003", - is_estimation=is_estimation, + cost_estimation=cost_estimation, ) cost += call_cost - if fuse and not is_estimation: + if fuse and not cost_estimation: # add docstring label only to insert it along the docstring into the code tale["file_docstring"] = DOCSTRING_LABEL + "\n" + file_docstring fuse_documentation(code, tale, output_path, file_name, file_ext) @@ -421,7 +421,7 @@ def process_file( logger.info(f"save dev tale in: {save_path}") - if not is_estimation: + if not cost_estimation: with open(save_path, "w") as json_file: json.dump(tale, json_file, indent=2) @@ -496,11 +496,10 @@ def fuse_documentation(code, tale, output_path, file_name, file_ext): ) @click.option( "--estimation", - "is_estimation", + "cost_estimation", is_flag=True, default=False, - help="True to calculate an approximate cost of documenting your code without \ - doing any GPT call", + help="When true, estimate the cost of openAI's API usage, without making any call", ) def main( path: str, @@ -509,7 +508,7 @@ def main( output_path: str = DEFAULT_OUTPUT_PATH, model_name: str = DEFAULT_MODEL_NAME, debug: bool = False, - is_estimation: bool = False, + cost_estimation: bool = False, ): load_dotenv() @@ -527,7 +526,7 @@ def main( model_name=model_name, fuse=fuse, debug=debug, - is_estimation=is_estimation, + cost_estimation=cost_estimation, ) else: logger.info("Processing folder") @@ -537,7 +536,7 @@ def main( model_name=model_name, fuse=fuse, debug=debug, - is_estimation=is_estimation, + cost_estimation=cost_estimation, ) elif os.path.isfile(path): logger.info("Processing file") @@ -547,13 +546,13 @@ def main( model_name=model_name, fuse=fuse, debug=debug, - is_estimation=is_estimation, + cost_estimation=cost_estimation, ) else: raise f"Invalid input path {path}. Path must be a directory or code file." - if is_estimation: + if cost_estimation: logger.info(f"Approximate cost: {price}") else: logger.info(f"Total cost: {price}") diff --git a/devtale/constants.py b/devtale/constants.py index 855f858..84462b9 100644 --- a/devtale/constants.py +++ b/devtale/constants.py @@ -14,4 +14,5 @@ DOCSTRING_LABEL = "@DEVTALE-GENERATED:" +# Extracted from https://openai.com/pricing on September 26th, 2023. GPT_PRICE = {"gpt-4": 0.03, "gpt-3.5-turbo-16k": 0.03, "text-davinci-003": 0.0015} diff --git a/devtale/utils.py b/devtale/utils.py index ff9f9be..ec7dd8e 100644 --- a/devtale/utils.py +++ b/devtale/utils.py @@ -33,7 +33,7 @@ def calculate_cost(input: str, model: str): - if model == "davinci": + if model == "text-davinci-003": encoding = "p50k_base" else: encoding = "cl100k_base" @@ -59,7 +59,7 @@ def split_code(code, language, chunk_size=1000, chunk_overlap=0): def extract_code_elements( - big_doc, verbose=False, model_name="gpt-4", is_estimation=False + big_doc, verbose=False, model_name="gpt-4", cost_estimation=False ): prompt = PromptTemplate( template=CODE_EXTRACTOR_TEMPLATE, @@ -69,7 +69,7 @@ def extract_code_elements( llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose ) - if is_estimation: + if cost_estimation: estimated_cost = calculate_cost( prompt.format(code=big_doc.page_content), model_name ) @@ -124,7 +124,7 @@ def redact_tale_information( docs, verbose=False, model_name="text-davinci-003", - is_estimation=False, + cost_estimation=False, ): prompt = PromptTemplate( template=TYPE_INFORMATION[content_type], input_variables=["information"] @@ -137,7 +137,7 @@ def redact_tale_information( else: information = str(docs) - if is_estimation: + if cost_estimation: estimated_cost = calculate_cost( prompt.format(information=information), model_name ) @@ -176,7 +176,7 @@ def convert_to_json(text_answer): def get_unit_tale( - short_doc, code_elements, model_name="gpt-4", verbose=False, is_estimation=False + short_doc, code_elements, model_name="gpt-4", verbose=False, cost_estimation=False ): parser = PydanticOutputParser(pydantic_object=FileDocumentation) prompt = PromptTemplate( @@ -188,7 +188,7 @@ def get_unit_tale( llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose ) - if is_estimation: + if cost_estimation: estimated_cost = calculate_cost( prompt.format( code=short_doc.page_content, code_elements=str(code_elements)