From 566b82065e58fc11c4f8f6b43668a1efdc7ea9e9 Mon Sep 17 00:00:00 2001
From: Alberto Gaona <albertoo_3c@hotmail.com>
Date: Mon, 25 Sep 2023 12:19:33 -0600
Subject: [PATCH 1/4] add logic to pre-estimate and estimate cost

---
 cli.py               | 108 ++++++++++++++++++++++++++++++++-----------
 devtale/constants.py |   2 +
 devtale/utils.py     |  56 ++++++++++++++++++----
 3 files changed, 129 insertions(+), 37 deletions(-)

diff --git a/cli.py b/cli.py
index a489717..6c32df2 100644
--- a/cli.py
+++ b/cli.py
@@ -28,6 +28,7 @@
     redact_tale_information,
     split_code,
     split_text,
+    update_budget,
 )
 
 DEFAULT_OUTPUT_PATH = "devtale_demo/"
@@ -44,7 +45,9 @@ def process_repository(
     model_name: str = DEFAULT_MODEL_NAME,
     fuse: bool = False,
     debug: bool = False,
+    is_estimation: bool = True,
 ) -> None:
+    budget = 0
     folder_tales = {
         "repository_name": os.path.basename(os.path.abspath(root_path)),
         "folders": [],
@@ -90,7 +93,7 @@ def process_repository(
 
             folder_full_name = os.path.relpath(folder_path, root_path)
 
-            folder_readme, folder_tale = process_folder(
+            folder_readme, folder_tale, folder_budget = process_folder(
                 folder_path=folder_path,
                 output_path=os.path.join(output_path, folder_full_name)
                 if folder_full_name != "."
@@ -99,7 +102,9 @@ def process_repository(
                 fuse=fuse,
                 debug=debug,
                 folder_full_name=folder_full_name,
+                is_estimation=is_estimation,
             )
+            budget += folder_budget
 
         except Exception as e:
             folder_name = os.path.basename(folder_path)
@@ -133,9 +138,13 @@ def process_repository(
 
     if folder_tales:
         folder_summaries = split_text(str(folder_tales), chunk_size=15000)
-        root_readme = redact_tale_information(
-            "root-level", folder_summaries, model_name="gpt-3.5-turbo-16k"
-        )["text"]
+        root_readme, tokens = redact_tale_information(
+            "root-level",
+            folder_summaries,
+            model_name="gpt-3.5-turbo-16k",
+            is_estimation=is_estimation,
+        )
+        budget += update_budget(tokens, "gpt-3.5-turbo-16k")
         root_readme = root_readme.replace("----------", "")
 
         # inject folders information
@@ -168,6 +177,8 @@ def process_repository(
         ) as file:
             file.write(root_readme)
 
+    return budget
+
 
 def process_folder(
     folder_path: str,
@@ -176,7 +187,9 @@ def process_folder(
     fuse: bool = False,
     debug: bool = False,
     folder_full_name: str = None,
+    is_estimation: bool = False,
 ) -> None:
+    budget = 0
     save_path = os.path.join(output_path, os.path.basename(folder_path))
     tales = []
 
@@ -189,7 +202,10 @@ def process_folder(
         ):
             logger.info(f"processing {file_path}")
             try:
-                file_tale = process_file(file_path, save_path, model_name, fuse, debug)
+                file_tale, file_budget = process_file(
+                    file_path, save_path, model_name, fuse, debug, is_estimation
+                )
+                budget += file_budget
             except Exception as e:
                 logger.info(
                     f"Failed to create dev tale for {file_path} - Exception: {e}"
@@ -250,14 +266,22 @@ def process_folder(
     if tales:
         files_summaries = split_text(str(tales), chunk_size=10000)
         # split into two calls to avoid issues with json decoding markdow text.
-        folder_readme = redact_tale_information(
-            "folder-level", files_summaries, model_name="gpt-3.5-turbo-16k"
-        )["text"]
+        folder_readme, fl_tokens = redact_tale_information(
+            "folder-level",
+            files_summaries,
+            model_name="gpt-3.5-turbo-16k",
+            is_estimation=is_estimation,
+        )
         folder_readme = folder_readme.replace("----------", "")
 
-        folder_overview = redact_tale_information(
-            "folder-description", folder_readme, model_name="gpt-3.5-turbo-16k"
-        )["text"]
+        folder_overview, fd_tokens = redact_tale_information(
+            "folder-description",
+            folder_readme,
+            model_name="gpt-3.5-turbo-16k",
+            is_estimation=is_estimation,
+        )
+
+        budget += update_budget(fl_tokens + fd_tokens, "gpt-3.5-turbo-16k")
 
         logger.info("save folder json..")
         with open(os.path.join(save_path, "folder_level.json"), "w") as json_file:
@@ -267,8 +291,8 @@ def process_folder(
         with open(os.path.join(save_path, "README.md"), "w", encoding="utf-8") as file:
             file.write(folder_readme)
 
-        return folder_readme, folder_overview
-    return None
+        return folder_readme, folder_overview, budget
+    return None, None, budget
 
 
 def process_file(
@@ -277,14 +301,16 @@ def process_file(
     model_name: str = DEFAULT_MODEL_NAME,
     fuse: bool = False,
     debug: bool = False,
+    is_estimation: bool = False,
 ) -> None:
+    budget = 0
     file_name = os.path.basename(file_path)
     file_ext = os.path.splitext(file_name)[-1]
     save_path = os.path.join(output_path, f"{file_name}.json")
 
     if debug:
         logger.debug(f"FILE INFO:\nfile_path: {file_path}\nsave_path: {save_path}")
-        return {"file_docstring": "-"}
+        return {"file_docstring": "-"}, budget
 
     if not os.path.exists(output_path):
         os.makedirs(output_path)
@@ -294,7 +320,7 @@ def process_file(
         code = file.read()
 
     if not code:
-        return {"file_docstring": ""}
+        return {"file_docstring": ""}, budget
 
     if os.path.exists(save_path):
         logger.info(f"Skipping {file_name} as its tale file already exists.")
@@ -302,7 +328,7 @@ def process_file(
             found_tale = json.load(file)
         if fuse:
             fuse_documentation(code, found_tale, output_path, file_name, file_ext)
-        return found_tale
+        return found_tale, budget
 
     if not file_ext or file_ext in ALLOWED_NO_CODE_EXTENSIONS:
         # a small single chunk is enough
@@ -311,10 +337,15 @@ def process_file(
             "file_name": file_name,
             "file_content": no_code_file,
         }
-        file_docstring = redact_tale_information("no-code-file", no_code_file_data)[
-            "text"
-        ]
-        return {"file_docstring": file_docstring}
+        file_docstring, tokens = redact_tale_information(
+            content_type="no-code-file",
+            docs=no_code_file_data,
+            model_name="text-davinci-003",
+            is_estimation=is_estimation,
+        )
+        budget += update_budget(tokens, "text-davinci-003")
+
+        return {"file_docstring": file_docstring}, budget
 
     logger.info("split dev draft ideas")
     big_docs = split_code(code, language=LANGUAGES[file_ext], chunk_size=10000)
@@ -323,7 +354,10 @@ def process_file(
     logger.info("extract code elements")
     code_elements = []
     for idx, doc in enumerate(big_docs):
-        elements_set = extract_code_elements(doc)
+        elements_set, tokens = extract_code_elements(
+            big_doc=doc, model_name=model_name, is_estimation=is_estimation
+        )
+        budget += update_budget(tokens, model_name)
         if elements_set:
             code_elements.append(elements_set)
 
@@ -343,9 +377,15 @@ def process_file(
     logger.info("create tale sections")
     tales_list = []
     # process only if we have elements to document
-    if code_elements_copy:
+    if code_elements_copy or is_estimation:
         for idx, doc in enumerate(short_docs):
-            tale = get_unit_tale(doc, code_elements_copy, model_name=model_name)
+            tale, tokens = get_unit_tale(
+                short_doc=doc,
+                code_elements=code_elements_copy,
+                model_name=model_name,
+                is_estimation=is_estimation,
+            )
+            budget += update_budget(tokens, model_name)
             tales_list.append(tale)
             logger.info(f"tale section {str(idx+1)}/{len(short_docs)} done.")
 
@@ -361,7 +401,13 @@ def process_file(
     logger.info("add dev tale summary")
     summaries = split_text(str(code_elements_dict["summary"]), chunk_size=9000)
 
-    file_docstring = redact_tale_information("top-level", summaries)["text"]
+    file_docstring, tokens = redact_tale_information(
+        content_type="top-level",
+        docs=summaries,
+        model_name="text-davinci-003",
+        is_estimation=is_estimation,
+    )
+    budget += update_budget(tokens, "text-davinci-003")
 
     if fuse:
         # add docstring label only to insert it along the docstring into the code
@@ -374,7 +420,7 @@ def process_file(
     with open(save_path, "w") as json_file:
         json.dump(tale, json_file, indent=2)
 
-    return tale
+    return tale, budget
 
 
 def fuse_documentation(code, tale, output_path, file_name, file_ext):
@@ -461,34 +507,40 @@ def main(
     if os.path.isdir(path):
         if recursive:
             logger.info("Processing repository")
-            process_repository(
+            price = process_repository(
                 root_path=path,
                 output_path=output_path,
                 model_name=model_name,
                 fuse=fuse,
                 debug=debug,
+                is_estimation=True,
             )
         else:
             logger.info("Processing folder")
-            process_folder(
+            _, price = process_folder(
                 folder_path=path,
                 output_path=output_path,
                 model_name=model_name,
                 fuse=fuse,
                 debug=debug,
+                is_estimation=False,
             )
     elif os.path.isfile(path):
         logger.info("Processing file")
-        process_file(
+        _, price = process_file(
             file_path=path,
             output_path=output_path,
             model_name=model_name,
             fuse=fuse,
             debug=debug,
+            is_estimation=False,
         )
+
     else:
         raise f"Invalid input path {path}. Path must be a directory or code file."
 
+    logger.info(f"Rough cost = {price}")
+
 
 if __name__ == "__main__":
     main()
diff --git a/devtale/constants.py b/devtale/constants.py
index 15c3b82..855f858 100644
--- a/devtale/constants.py
+++ b/devtale/constants.py
@@ -13,3 +13,5 @@
 }
 
 DOCSTRING_LABEL = "@DEVTALE-GENERATED:"
+
+GPT_PRICE = {"gpt-4": 0.03, "gpt-3.5-turbo-16k": 0.03, "text-davinci-003": 0.0015}
diff --git a/devtale/utils.py b/devtale/utils.py
index afd303f..a784cd8 100644
--- a/devtale/utils.py
+++ b/devtale/utils.py
@@ -4,12 +4,13 @@
 from json import JSONDecodeError
 from pathlib import Path
 
+import tiktoken
 from langchain import LLMChain, OpenAI, PromptTemplate
 from langchain.chat_models import ChatOpenAI
 from langchain.output_parsers import PydanticOutputParser
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
-from devtale.constants import DOCSTRING_LABEL
+from devtale.constants import DOCSTRING_LABEL, GPT_PRICE
 from devtale.schema import FileDocumentation
 from devtale.templates import (
     CODE_EXTRACTOR_TEMPLATE,
@@ -30,6 +31,20 @@
 }
 
 
+def calc_tokens(input: str, model: str) -> int:
+    if model == "davinci":
+        encoding = "p50k_base"
+    else:
+        encoding = "cl100k_base"
+
+    tokens = tiktoken.get_encoding(encoding).encode(input)
+    return len(tokens)
+
+
+def update_budget(n_tokens, model: str):
+    return (n_tokens / 1000) * GPT_PRICE[model]
+
+
 def split_text(text, chunk_size=1000, chunk_overlap=0):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=chunk_size, chunk_overlap=chunk_overlap
@@ -46,17 +61,23 @@ def split_code(code, language, chunk_size=1000, chunk_overlap=0):
     return docs
 
 
-def extract_code_elements(big_doc, verbose=False):
+def extract_code_elements(
+    big_doc, verbose=False, model_name="gpt-4", is_estimation=False
+):
     prompt = PromptTemplate(
         template=CODE_EXTRACTOR_TEMPLATE,
         input_variables=["code"],
     )
     extractor = LLMChain(
-        llm=ChatOpenAI(model_name="gpt-4"), prompt=prompt, verbose=verbose
+        llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose
     )
 
+    tokens = calc_tokens(prompt.format(code=big_doc.page_content), model_name)
+    if is_estimation:
+        return "", tokens
+
     result_string = extractor({"code": big_doc.page_content})
-    return result_string["text"]
+    return result_string["text"], tokens
 
 
 def _process_extracted_code_element(text: str):
@@ -97,7 +118,11 @@ def prepare_code_elements(code_elements):
 
 
 def redact_tale_information(
-    content_type, docs, verbose=False, model_name="text-davinci-003"
+    content_type,
+    docs,
+    verbose=False,
+    model_name="text-davinci-003",
+    is_estimation=False,
 ):
     prompt = PromptTemplate(
         template=TYPE_INFORMATION[content_type], input_variables=["information"]
@@ -110,9 +135,13 @@ def redact_tale_information(
     else:
         information = str(docs)
 
-    text_answer = teller_of_tales({"information": information})
+    tokens = calc_tokens(prompt.format(information=information), model_name)
 
-    return text_answer
+    if is_estimation:
+        return "", tokens
+
+    text_answer = teller_of_tales({"information": information})
+    return text_answer["text"], tokens
 
 
 def convert_to_json(text_answer):
@@ -140,7 +169,9 @@ def convert_to_json(text_answer):
             return None
 
 
-def get_unit_tale(short_doc, code_elements, model_name="gpt-4", verbose=False):
+def get_unit_tale(
+    short_doc, code_elements, model_name="gpt-4", verbose=False, is_estimation=False
+):
     parser = PydanticOutputParser(pydantic_object=FileDocumentation)
     prompt = PromptTemplate(
         template=CODE_LEVEL_TEMPLATE,
@@ -151,6 +182,13 @@ def get_unit_tale(short_doc, code_elements, model_name="gpt-4", verbose=False):
         llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose
     )
 
+    tokens = calc_tokens(
+        prompt.format(code=short_doc.page_content, code_elements=str(code_elements)),
+        model_name,
+    )
+    if is_estimation:
+        return {"classes": [], "methods": []}, tokens
+
     result_string = teller_of_tales(
         {"code": short_doc.page_content, "code_elements": code_elements}
     )
@@ -158,7 +196,7 @@ def get_unit_tale(short_doc, code_elements, model_name="gpt-4", verbose=False):
     if not json_answer:
         print("Returning empty JSON due to a failure")
         json_answer = {"classes": [], "methods": []}
-    return json_answer
+    return json_answer, tokens
 
 
 def is_hallucination(code_definition, code, expected_definitions):

From d320d6f2dfef724acf44b084c0543ad10443ead7 Mon Sep 17 00:00:00 2001
From: Alberto Gaona <albertoo_3c@hotmail.com>
Date: Mon, 25 Sep 2023 12:56:17 -0600
Subject: [PATCH 2/4] avoid saving tales if it is an estimation

---
 cli.py | 62 +++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 40 insertions(+), 22 deletions(-)

diff --git a/cli.py b/cli.py
index 6c32df2..bd475dd 100644
--- a/cli.py
+++ b/cli.py
@@ -167,15 +167,16 @@ def process_repository(
 
             root_readme = root_readme + modified_original_readme
 
-        logger.info("save root json..")
-        with open(os.path.join(output_path, "root_level.json"), "w") as json_file:
-            json.dump(folder_tales, json_file, indent=2)
+        if not is_estimation:
+            logger.info("save root json..")
+            with open(os.path.join(output_path, "root_level.json"), "w") as json_file:
+                json.dump(folder_tales, json_file, indent=2)
 
-        logger.info(f"saving root index in {output_path}")
-        with open(
-            os.path.join(output_path, "README.md"), "w", encoding="utf-8"
-        ) as file:
-            file.write(root_readme)
+            logger.info(f"saving root index in {output_path}")
+            with open(
+                os.path.join(output_path, "README.md"), "w", encoding="utf-8"
+            ) as file:
+                file.write(root_readme)
 
     return budget
 
@@ -261,7 +262,7 @@ def process_folder(
         """
         )
         logger.debug(f"FILE_TALES: {tales}")
-        return "-", "-"
+        return "-", "-", budget
 
     if tales:
         files_summaries = split_text(str(tales), chunk_size=10000)
@@ -283,13 +284,16 @@ def process_folder(
 
         budget += update_budget(fl_tokens + fd_tokens, "gpt-3.5-turbo-16k")
 
-        logger.info("save folder json..")
-        with open(os.path.join(save_path, "folder_level.json"), "w") as json_file:
-            json.dump(tales, json_file, indent=2)
+        if not is_estimation:
+            logger.info("save folder json..")
+            with open(os.path.join(save_path, "folder_level.json"), "w") as json_file:
+                json.dump(tales, json_file, indent=2)
 
-        logger.info(f"saving index in {save_path}")
-        with open(os.path.join(save_path, "README.md"), "w", encoding="utf-8") as file:
-            file.write(folder_readme)
+            logger.info(f"saving index in {save_path}")
+            with open(
+                os.path.join(save_path, "README.md"), "w", encoding="utf-8"
+            ) as file:
+                file.write(folder_readme)
 
         return folder_readme, folder_overview, budget
     return None, None, budget
@@ -409,7 +413,7 @@ def process_file(
     )
     budget += update_budget(tokens, "text-davinci-003")
 
-    if fuse:
+    if fuse and not is_estimation:
         # add docstring label only to insert it along the docstring into the code
         tale["file_docstring"] = DOCSTRING_LABEL + "\n" + file_docstring
         fuse_documentation(code, tale, output_path, file_name, file_ext)
@@ -417,8 +421,10 @@ def process_file(
     tale["file_docstring"] = file_docstring
 
     logger.info(f"save dev tale in: {save_path}")
-    with open(save_path, "w") as json_file:
-        json.dump(tale, json_file, indent=2)
+
+    if not is_estimation:
+        with open(save_path, "w") as json_file:
+            json.dump(tale, json_file, indent=2)
 
     return tale, budget
 
@@ -489,6 +495,14 @@ def fuse_documentation(code, tale, output_path, file_name, file_ext):
     default=False,
     help="Mock answer and avoid GPT calls",
 )
+@click.option(
+    "--estimation",
+    "is_estimation",
+    is_flag=True,
+    default=False,
+    help="True to calculate an approximate cost of documenting your code without \
+          doing any GPT call",
+)
 def main(
     path: str,
     recursive: bool,
@@ -496,6 +510,7 @@ def main(
     output_path: str = DEFAULT_OUTPUT_PATH,
     model_name: str = DEFAULT_MODEL_NAME,
     debug: bool = False,
+    is_estimation: bool = False,
 ):
     load_dotenv()
 
@@ -513,7 +528,7 @@ def main(
                 model_name=model_name,
                 fuse=fuse,
                 debug=debug,
-                is_estimation=True,
+                is_estimation=is_estimation,
             )
         else:
             logger.info("Processing folder")
@@ -523,7 +538,7 @@ def main(
                 model_name=model_name,
                 fuse=fuse,
                 debug=debug,
-                is_estimation=False,
+                is_estimation=is_estimation,
             )
     elif os.path.isfile(path):
         logger.info("Processing file")
@@ -533,13 +548,16 @@ def main(
             model_name=model_name,
             fuse=fuse,
             debug=debug,
-            is_estimation=False,
+            is_estimation=is_estimation,
         )
 
     else:
         raise f"Invalid input path {path}. Path must be a directory or code file."
 
-    logger.info(f"Rough cost = {price}")
+    if is_estimation:
+        logger.info(f"Approximate cost: {price}")
+    else:
+        logger.info(f"Cost: {price}")
 
 
 if __name__ == "__main__":

From 7b514f371de6718fcb051ff4d6925300f8495f82 Mon Sep 17 00:00:00 2001
From: Alberto Gaona <albertoo_3c@hotmail.com>
Date: Mon, 25 Sep 2023 13:36:34 -0600
Subject: [PATCH 3/4] report the actual total cost when not running an estimation

---
 cli.py           | 61 ++++++++++++++++++++++++------------------------
 devtale/utils.py | 59 +++++++++++++++++++++++++++-------------------
 2 files changed, 65 insertions(+), 55 deletions(-)

diff --git a/cli.py b/cli.py
index bd475dd..13e4110 100644
--- a/cli.py
+++ b/cli.py
@@ -28,7 +28,6 @@
     redact_tale_information,
     split_code,
     split_text,
-    update_budget,
 )
 
 DEFAULT_OUTPUT_PATH = "devtale_demo/"
@@ -47,7 +46,7 @@ def process_repository(
     debug: bool = False,
     is_estimation: bool = True,
 ) -> None:
-    budget = 0
+    cost = 0
     folder_tales = {
         "repository_name": os.path.basename(os.path.abspath(root_path)),
         "folders": [],
@@ -93,7 +92,7 @@ def process_repository(
 
             folder_full_name = os.path.relpath(folder_path, root_path)
 
-            folder_readme, folder_tale, folder_budget = process_folder(
+            folder_readme, folder_tale, folder_cost = process_folder(
                 folder_path=folder_path,
                 output_path=os.path.join(output_path, folder_full_name)
                 if folder_full_name != "."
@@ -104,7 +103,7 @@ def process_repository(
                 folder_full_name=folder_full_name,
                 is_estimation=is_estimation,
             )
-            budget += folder_budget
+            cost += folder_cost
 
         except Exception as e:
             folder_name = os.path.basename(folder_path)
@@ -138,13 +137,13 @@ def process_repository(
 
     if folder_tales:
         folder_summaries = split_text(str(folder_tales), chunk_size=15000)
-        root_readme, tokens = redact_tale_information(
+        root_readme, call_cost = redact_tale_information(
             "root-level",
             folder_summaries,
             model_name="gpt-3.5-turbo-16k",
             is_estimation=is_estimation,
         )
-        budget += update_budget(tokens, "gpt-3.5-turbo-16k")
+        cost += call_cost
         root_readme = root_readme.replace("----------", "")
 
         # inject folders information
@@ -178,7 +177,7 @@ def process_repository(
             ) as file:
                 file.write(root_readme)
 
-    return budget
+    return cost
 
 
 def process_folder(
@@ -190,7 +189,7 @@ def process_folder(
     folder_full_name: str = None,
     is_estimation: bool = False,
 ) -> None:
-    budget = 0
+    cost = 0
     save_path = os.path.join(output_path, os.path.basename(folder_path))
     tales = []
 
@@ -203,10 +202,10 @@ def process_folder(
         ):
             logger.info(f"processing {file_path}")
             try:
-                file_tale, file_budget = process_file(
+                file_tale, file_cost = process_file(
                     file_path, save_path, model_name, fuse, debug, is_estimation
                 )
-                budget += file_budget
+                cost += file_cost
             except Exception as e:
                 logger.info(
                     f"Failed to create dev tale for {file_path} - Exception: {e}"
@@ -262,12 +261,12 @@ def process_folder(
         """
         )
         logger.debug(f"FILE_TALES: {tales}")
-        return "-", "-", budget
+        return "-", "-", cost
 
     if tales:
         files_summaries = split_text(str(tales), chunk_size=10000)
         # split into two calls to avoid issues with json decoding markdow text.
-        folder_readme, fl_tokens = redact_tale_information(
+        folder_readme, fl_cost = redact_tale_information(
             "folder-level",
             files_summaries,
             model_name="gpt-3.5-turbo-16k",
@@ -275,14 +274,14 @@ def process_folder(
         )
         folder_readme = folder_readme.replace("----------", "")
 
-        folder_overview, fd_tokens = redact_tale_information(
+        folder_overview, fd_cost = redact_tale_information(
             "folder-description",
             folder_readme,
             model_name="gpt-3.5-turbo-16k",
             is_estimation=is_estimation,
         )
 
-        budget += update_budget(fl_tokens + fd_tokens, "gpt-3.5-turbo-16k")
+        cost += fl_cost + fd_cost
 
         if not is_estimation:
             logger.info("save folder json..")
@@ -295,8 +294,8 @@ def process_folder(
             ) as file:
                 file.write(folder_readme)
 
-        return folder_readme, folder_overview, budget
-    return None, None, budget
+        return folder_readme, folder_overview, cost
+    return None, None, cost
 
 
 def process_file(
@@ -307,14 +306,14 @@ def process_file(
     debug: bool = False,
     is_estimation: bool = False,
 ) -> None:
-    budget = 0
+    cost = 0
     file_name = os.path.basename(file_path)
     file_ext = os.path.splitext(file_name)[-1]
     save_path = os.path.join(output_path, f"{file_name}.json")
 
     if debug:
         logger.debug(f"FILE INFO:\nfile_path: {file_path}\nsave_path: {save_path}")
-        return {"file_docstring": "-"}, budget
+        return {"file_docstring": "-"}, cost
 
     if not os.path.exists(output_path):
         os.makedirs(output_path)
@@ -324,7 +323,7 @@ def process_file(
         code = file.read()
 
     if not code:
-        return {"file_docstring": ""}, budget
+        return {"file_docstring": ""}, cost
 
     if os.path.exists(save_path):
         logger.info(f"Skipping {file_name} as its tale file already exists.")
@@ -332,7 +331,7 @@ def process_file(
             found_tale = json.load(file)
         if fuse:
             fuse_documentation(code, found_tale, output_path, file_name, file_ext)
-        return found_tale, budget
+        return found_tale, cost
 
     if not file_ext or file_ext in ALLOWED_NO_CODE_EXTENSIONS:
         # a small single chunk is enough
@@ -341,15 +340,15 @@ def process_file(
             "file_name": file_name,
             "file_content": no_code_file,
         }
-        file_docstring, tokens = redact_tale_information(
+        file_docstring, call_cost = redact_tale_information(
             content_type="no-code-file",
             docs=no_code_file_data,
             model_name="text-davinci-003",
             is_estimation=is_estimation,
         )
-        budget += update_budget(tokens, "text-davinci-003")
+        cost += call_cost
 
-        return {"file_docstring": file_docstring}, budget
+        return {"file_docstring": file_docstring}, cost
 
     logger.info("split dev draft ideas")
     big_docs = split_code(code, language=LANGUAGES[file_ext], chunk_size=10000)
@@ -358,10 +357,10 @@ def process_file(
     logger.info("extract code elements")
     code_elements = []
     for idx, doc in enumerate(big_docs):
-        elements_set, tokens = extract_code_elements(
+        elements_set, call_cost = extract_code_elements(
             big_doc=doc, model_name=model_name, is_estimation=is_estimation
         )
-        budget += update_budget(tokens, model_name)
+        cost += call_cost
         if elements_set:
             code_elements.append(elements_set)
 
@@ -383,13 +382,13 @@ def process_file(
     # process only if we have elements to document
     if code_elements_copy or is_estimation:
         for idx, doc in enumerate(short_docs):
-            tale, tokens = get_unit_tale(
+            tale, call_cost = get_unit_tale(
                 short_doc=doc,
                 code_elements=code_elements_copy,
                 model_name=model_name,
                 is_estimation=is_estimation,
             )
-            budget += update_budget(tokens, model_name)
+            cost += call_cost
             tales_list.append(tale)
             logger.info(f"tale section {str(idx+1)}/{len(short_docs)} done.")
 
@@ -405,13 +404,13 @@ def process_file(
     logger.info("add dev tale summary")
     summaries = split_text(str(code_elements_dict["summary"]), chunk_size=9000)
 
-    file_docstring, tokens = redact_tale_information(
+    file_docstring, call_cost = redact_tale_information(
         content_type="top-level",
         docs=summaries,
         model_name="text-davinci-003",
         is_estimation=is_estimation,
     )
-    budget += update_budget(tokens, "text-davinci-003")
+    cost += call_cost
 
     if fuse and not is_estimation:
         # add docstring label only to insert it along the docstring into the code
@@ -426,7 +425,7 @@ def process_file(
         with open(save_path, "w") as json_file:
             json.dump(tale, json_file, indent=2)
 
-    return tale, budget
+    return tale, cost
 
 
 def fuse_documentation(code, tale, output_path, file_name, file_ext):
@@ -557,7 +556,7 @@ def main(
     if is_estimation:
         logger.info(f"Approximate cost: {price}")
     else:
-        logger.info(f"Cost: {price}")
+        logger.info(f"Total cost: {price}")
 
 
 if __name__ == "__main__":
diff --git a/devtale/utils.py b/devtale/utils.py
index a784cd8..ff9f9be 100644
--- a/devtale/utils.py
+++ b/devtale/utils.py
@@ -6,6 +6,7 @@
 
 import tiktoken
 from langchain import LLMChain, OpenAI, PromptTemplate
+from langchain.callbacks import get_openai_callback
 from langchain.chat_models import ChatOpenAI
 from langchain.output_parsers import PydanticOutputParser
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -31,18 +32,14 @@
 }
 
 
-def calc_tokens(input: str, model: str) -> int:
+def calculate_cost(input: str, model: str):
     if model == "davinci":
         encoding = "p50k_base"
     else:
         encoding = "cl100k_base"
 
     tokens = tiktoken.get_encoding(encoding).encode(input)
-    return len(tokens)
-
-
-def update_budget(n_tokens, model: str):
-    return (n_tokens / 1000) * GPT_PRICE[model]
+    return (len(tokens) / 1000) * GPT_PRICE[model]
 
 
 def split_text(text, chunk_size=1000, chunk_overlap=0):
@@ -72,12 +69,17 @@ def extract_code_elements(
         llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose
     )
 
-    tokens = calc_tokens(prompt.format(code=big_doc.page_content), model_name)
     if is_estimation:
-        return "", tokens
+        estimated_cost = calculate_cost(
+            prompt.format(code=big_doc.page_content), model_name
+        )
+        return "", estimated_cost
 
-    result_string = extractor({"code": big_doc.page_content})
-    return result_string["text"], tokens
+    with get_openai_callback() as cb:
+        result_string = extractor({"code": big_doc.page_content})
+        cost = cb.total_cost
+
+    return result_string["text"], cost
 
 
 def _process_extracted_code_element(text: str):
@@ -135,13 +137,17 @@ def redact_tale_information(
     else:
         information = str(docs)
 
-    tokens = calc_tokens(prompt.format(information=information), model_name)
-
     if is_estimation:
-        return "", tokens
+        estimated_cost = calculate_cost(
+            prompt.format(information=information), model_name
+        )
+        return "", estimated_cost
 
-    text_answer = teller_of_tales({"information": information})
-    return text_answer["text"], tokens
+    with get_openai_callback() as cb:
+        text_answer = teller_of_tales({"information": information})
+        cost = cb.total_cost
+
+    return text_answer["text"], cost
 
 
 def convert_to_json(text_answer):
@@ -182,21 +188,26 @@ def get_unit_tale(
         llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose
     )
 
-    tokens = calc_tokens(
-        prompt.format(code=short_doc.page_content, code_elements=str(code_elements)),
-        model_name,
-    )
     if is_estimation:
-        return {"classes": [], "methods": []}, tokens
+        estimated_cost = calculate_cost(
+            prompt.format(
+                code=short_doc.page_content, code_elements=str(code_elements)
+            ),
+            model_name,
+        )
+        return {"classes": [], "methods": []}, estimated_cost
+
+    with get_openai_callback() as cb:
+        result_string = teller_of_tales(
+            {"code": short_doc.page_content, "code_elements": code_elements}
+        )
+        cost = cb.total_cost
 
-    result_string = teller_of_tales(
-        {"code": short_doc.page_content, "code_elements": code_elements}
-    )
     json_answer = convert_to_json(result_string)
     if not json_answer:
         print("Returning empty JSON due to a failure")
         json_answer = {"classes": [], "methods": []}
-    return json_answer, tokens
+    return json_answer, cost
 
 
 def is_hallucination(code_definition, code, expected_definitions):

From 2cd6d10ad7671c5e7814fa8f8150703f3fb775fa Mon Sep 17 00:00:00 2001
From: Alberto Gaona <albertoo_3c@hotmail.com>
Date: Tue, 26 Sep 2023 07:05:53 -0600
Subject: [PATCH 4/4] fix review comments

---
 cli.py               | 49 ++++++++++++++++++++++----------------------
 devtale/constants.py |  1 +
 devtale/utils.py     | 14 ++++++-------
 3 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/cli.py b/cli.py
index 13e4110..1e77187 100644
--- a/cli.py
+++ b/cli.py
@@ -44,7 +44,7 @@ def process_repository(
     model_name: str = DEFAULT_MODEL_NAME,
     fuse: bool = False,
     debug: bool = False,
-    is_estimation: bool = True,
+    cost_estimation: bool = False,
 ) -> None:
     cost = 0
     folder_tales = {
@@ -101,7 +101,7 @@ def process_repository(
                 fuse=fuse,
                 debug=debug,
                 folder_full_name=folder_full_name,
-                is_estimation=is_estimation,
+                cost_estimation=cost_estimation,
             )
             cost += folder_cost
 
@@ -141,7 +141,7 @@ def process_repository(
             "root-level",
             folder_summaries,
             model_name="gpt-3.5-turbo-16k",
-            is_estimation=is_estimation,
+            cost_estimation=cost_estimation,
         )
         cost += call_cost
         root_readme = root_readme.replace("----------", "")
@@ -166,7 +166,7 @@ def process_repository(
 
             root_readme = root_readme + modified_original_readme
 
-        if not is_estimation:
+        if not cost_estimation:
             logger.info("save root json..")
             with open(os.path.join(output_path, "root_level.json"), "w") as json_file:
                 json.dump(folder_tales, json_file, indent=2)
@@ -187,7 +187,7 @@ def process_folder(
     fuse: bool = False,
     debug: bool = False,
     folder_full_name: str = None,
-    is_estimation: bool = False,
+    cost_estimation: bool = False,
 ) -> None:
     cost = 0
     save_path = os.path.join(output_path, os.path.basename(folder_path))
@@ -203,7 +203,7 @@ def process_folder(
             logger.info(f"processing {file_path}")
             try:
                 file_tale, file_cost = process_file(
-                    file_path, save_path, model_name, fuse, debug, is_estimation
+                    file_path, save_path, model_name, fuse, debug, cost_estimation
                 )
                 cost += file_cost
             except Exception as e:
@@ -270,7 +270,7 @@ def process_folder(
             "folder-level",
             files_summaries,
             model_name="gpt-3.5-turbo-16k",
-            is_estimation=is_estimation,
+            cost_estimation=cost_estimation,
         )
         folder_readme = folder_readme.replace("----------", "")
 
@@ -278,12 +278,12 @@ def process_folder(
             "folder-description",
             folder_readme,
             model_name="gpt-3.5-turbo-16k",
-            is_estimation=is_estimation,
+            cost_estimation=cost_estimation,
         )
 
         cost += fl_cost + fd_cost
 
-        if not is_estimation:
+        if not cost_estimation:
             logger.info("save folder json..")
             with open(os.path.join(save_path, "folder_level.json"), "w") as json_file:
                 json.dump(tales, json_file, indent=2)
@@ -304,7 +304,7 @@ def process_file(
     model_name: str = DEFAULT_MODEL_NAME,
     fuse: bool = False,
     debug: bool = False,
-    is_estimation: bool = False,
+    cost_estimation: bool = False,
 ) -> None:
     cost = 0
     file_name = os.path.basename(file_path)
@@ -344,7 +344,7 @@ def process_file(
             content_type="no-code-file",
             docs=no_code_file_data,
             model_name="text-davinci-003",
-            is_estimation=is_estimation,
+            cost_estimation=cost_estimation,
         )
         cost += call_cost
 
@@ -358,7 +358,7 @@ def process_file(
     code_elements = []
     for idx, doc in enumerate(big_docs):
         elements_set, call_cost = extract_code_elements(
-            big_doc=doc, model_name=model_name, is_estimation=is_estimation
+            big_doc=doc, model_name=model_name, cost_estimation=cost_estimation
         )
         cost += call_cost
         if elements_set:
@@ -380,13 +380,13 @@ def process_file(
     logger.info("create tale sections")
     tales_list = []
     # process only if we have elements to document
-    if code_elements_copy or is_estimation:
+    if code_elements_copy or cost_estimation:
         for idx, doc in enumerate(short_docs):
             tale, call_cost = get_unit_tale(
                 short_doc=doc,
                 code_elements=code_elements_copy,
                 model_name=model_name,
-                is_estimation=is_estimation,
+                cost_estimation=cost_estimation,
             )
             cost += call_cost
             tales_list.append(tale)
@@ -408,11 +408,11 @@ def process_file(
         content_type="top-level",
         docs=summaries,
         model_name="text-davinci-003",
-        is_estimation=is_estimation,
+        cost_estimation=cost_estimation,
     )
     cost += call_cost
 
-    if fuse and not is_estimation:
+    if fuse and not cost_estimation:
         # add docstring label only to insert it along the docstring into the code
         tale["file_docstring"] = DOCSTRING_LABEL + "\n" + file_docstring
         fuse_documentation(code, tale, output_path, file_name, file_ext)
@@ -421,7 +421,7 @@ def process_file(
 
     logger.info(f"save dev tale in: {save_path}")
 
-    if not is_estimation:
+    if not cost_estimation:
         with open(save_path, "w") as json_file:
             json.dump(tale, json_file, indent=2)
 
@@ -496,11 +496,10 @@ def fuse_documentation(code, tale, output_path, file_name, file_ext):
 )
 @click.option(
     "--estimation",
-    "is_estimation",
+    "cost_estimation",
     is_flag=True,
     default=False,
-    help="True to calculate an approximate cost of documenting your code without \
-          doing any GPT call",
+    help="When true, estimate the cost of openAI's API usage, without making any call",
 )
 def main(
     path: str,
@@ -509,7 +508,7 @@ def main(
     output_path: str = DEFAULT_OUTPUT_PATH,
     model_name: str = DEFAULT_MODEL_NAME,
     debug: bool = False,
-    is_estimation: bool = False,
+    cost_estimation: bool = False,
 ):
     load_dotenv()
 
@@ -527,7 +526,7 @@ def main(
                 model_name=model_name,
                 fuse=fuse,
                 debug=debug,
-                is_estimation=is_estimation,
+                cost_estimation=cost_estimation,
             )
         else:
             logger.info("Processing folder")
@@ -537,7 +536,7 @@ def main(
                 model_name=model_name,
                 fuse=fuse,
                 debug=debug,
-                is_estimation=is_estimation,
+                cost_estimation=cost_estimation,
             )
     elif os.path.isfile(path):
         logger.info("Processing file")
@@ -547,13 +546,17 @@ def main(
             model_name=model_name,
             fuse=fuse,
             debug=debug,
-            is_estimation=is_estimation,
+            cost_estimation=cost_estimation,
         )
 
     else:
-        raise f"Invalid input path {path}. Path must be a directory or code file."
+        # Python 3 only allows raising BaseException subclasses; raising a bare
+        # string surfaces as an unrelated TypeError and hides this message.
+        raise ValueError(
+            f"Invalid input path {path}. Path must be a directory or code file."
+        )
 
-    if is_estimation:
+    if cost_estimation:
         logger.info(f"Approximate cost: {price}")
     else:
         logger.info(f"Total cost: {price}")
diff --git a/devtale/constants.py b/devtale/constants.py
index 855f858..84462b9 100644
--- a/devtale/constants.py
+++ b/devtale/constants.py
@@ -14,4 +14,6 @@
 
 DOCSTRING_LABEL = "@DEVTALE-GENERATED:"
 
-GPT_PRICE = {"gpt-4": 0.03, "gpt-3.5-turbo-16k": 0.03, "text-davinci-003": 0.0015}
+# USD per 1,000 prompt (input) tokens, keyed by model name.
+# Extracted from https://openai.com/pricing on September 26th, 2023.
+GPT_PRICE = {"gpt-4": 0.03, "gpt-3.5-turbo-16k": 0.003, "text-davinci-003": 0.02}
diff --git a/devtale/utils.py b/devtale/utils.py
index ff9f9be..ec7dd8e 100644
--- a/devtale/utils.py
+++ b/devtale/utils.py
@@ -33,7 +33,7 @@
 
 
 def calculate_cost(input: str, model: str):
-    if model == "davinci":
+    if model == "text-davinci-003":
         encoding = "p50k_base"
     else:
         encoding = "cl100k_base"
@@ -59,7 +59,7 @@ def split_code(code, language, chunk_size=1000, chunk_overlap=0):
 
 
 def extract_code_elements(
-    big_doc, verbose=False, model_name="gpt-4", is_estimation=False
+    big_doc, verbose=False, model_name="gpt-4", cost_estimation=False
 ):
     prompt = PromptTemplate(
         template=CODE_EXTRACTOR_TEMPLATE,
@@ -69,7 +69,7 @@ def extract_code_elements(
         llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose
     )
 
-    if is_estimation:
+    if cost_estimation:
         estimated_cost = calculate_cost(
             prompt.format(code=big_doc.page_content), model_name
         )
@@ -124,7 +124,7 @@ def redact_tale_information(
     docs,
     verbose=False,
     model_name="text-davinci-003",
-    is_estimation=False,
+    cost_estimation=False,
 ):
     prompt = PromptTemplate(
         template=TYPE_INFORMATION[content_type], input_variables=["information"]
@@ -137,7 +137,7 @@ def redact_tale_information(
     else:
         information = str(docs)
 
-    if is_estimation:
+    if cost_estimation:
         estimated_cost = calculate_cost(
             prompt.format(information=information), model_name
         )
@@ -176,7 +176,7 @@ def convert_to_json(text_answer):
 
 
 def get_unit_tale(
-    short_doc, code_elements, model_name="gpt-4", verbose=False, is_estimation=False
+    short_doc, code_elements, model_name="gpt-4", verbose=False, cost_estimation=False
 ):
     parser = PydanticOutputParser(pydantic_object=FileDocumentation)
     prompt = PromptTemplate(
@@ -188,7 +188,7 @@ def get_unit_tale(
         llm=ChatOpenAI(model_name=model_name), prompt=prompt, verbose=verbose
     )
 
-    if is_estimation:
+    if cost_estimation:
         estimated_cost = calculate_cost(
             prompt.format(
                 code=short_doc.page_content, code_elements=str(code_elements)