From 70159458d77e0edd41b4015def519e541444bce0 Mon Sep 17 00:00:00 2001
From: Alexander Gusman
Date: Sat, 17 Jun 2023 08:16:42 +0200
Subject: [PATCH 1/8] feat: pass DEFAULT_MODEL and DEFAULT_MAX_TOKENS via
 environment variables

---
 constants.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/constants.py b/constants.py
index b7ccc11b4..cdc8c7230 100644
--- a/constants.py
+++ b/constants.py
@@ -1,4 +1,15 @@
+import os
+
 EXTENSION_TO_SKIP = [".png",".jpg",".jpeg",".gif",".bmp",".svg",".ico",".tif",".tiff"]
 DEFAULT_DIR = "generated"
-DEFAULT_MODEL = "gpt-3.5-turbo" # we recommend 'gpt-4' if you have it # gpt3.5 is going to be worse at generating code so we strongly recommend gpt4. i know most people dont have access, we are working on a hosted version
-DEFAULT_MAX_TOKENS = 2000 # i wonder how to tweak this properly. we dont want it to be max length as it encourages verbosity of code. but too short and code also truncates suddenly.
\ No newline at end of file
+# https://platform.openai.com/docs/models/gpt-4
+try:
+    DEFAULT_MODEL = os.environ["OPENAI_DEFAULT_MODEL"]
+except KeyError:
+    # we recommend 'gpt-4' if you have it; gpt-3.5 is going to be worse at generating code, so we strongly recommend gpt-4. we know most people don't have access yet, and we are working on a hosted version
+    DEFAULT_MODEL = "gpt-3.5-turbo"
+try:
+    DEFAULT_MAX_TOKENS = int(os.environ["OPENAI_DEFAULT_MAX_TOKENS"])
+except KeyError:
+    # it is not obvious how to tune this: we don't want max length, as it encourages verbose code, but too short and code truncates suddenly
+    DEFAULT_MAX_TOKENS = 2000
\ No newline at end of file

From 781fa4664ae5b051c2b130315a93578b2b62df31 Mon Sep 17 00:00:00 2001
From: Alexander Gusman
Date: Sat, 17 Jun 2023 09:31:51 +0200
Subject: [PATCH 2/8] feat: USE_FULL_PROJECT_PROMPT + extracting code from ```

USE_FULL_PROJECT_PROMPT: if enabled, each file generation prompt will
include all the files generated before it.

Also, GPT-3.5 sometimes still returns some words around the content of the
file; this change extracts the code from ``` blocks.

Also adds the OPENAI_DEFAULT_MODEL and OPENAI_DEFAULT_MAX_TOKENS env vars.
---
 constants.py     |  9 +++++++++
 main.py          | 32 +++++++++++++++++++++++++-------
 main_no_modal.py | 31 +++++++++++++++++++++++++------
 3 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/constants.py b/constants.py
index cdc8c7230..baa3d9b57 100644
--- a/constants.py
+++ b/constants.py
@@ -2,6 +2,15 @@
 EXTENSION_TO_SKIP = [".png",".jpg",".jpeg",".gif",".bmp",".svg",".ico",".tif",".tiff"]
 DEFAULT_DIR = "generated"
+
+try:
+    USE_FULL_PROJECT_PROMPT = bool(os.environ["USE_FULL_PROJECT_PROMPT"])
+except KeyError:
+    # If enabled, each file generation prompt will include all the files generated before it
+    # It helps to make the code much more consistent
+    # But it requires at least a 16k context model, even for a small project
+    USE_FULL_PROJECT_PROMPT = False
+
 # https://platform.openai.com/docs/models/gpt-4
 try:
     DEFAULT_MODEL = os.environ["OPENAI_DEFAULT_MODEL"]

diff --git a/main.py b/main.py
index 9d7ecd5db..59251e3f2 100644
--- a/main.py
+++ b/main.py
@@ -1,8 +1,9 @@
 import os
+import re
 import modal
 import ast
 from utils import clean_dir
-from constants import DEFAULT_DIR, DEFAULT_MODEL, DEFAULT_MAX_TOKENS
+from constants import DEFAULT_DIR, DEFAULT_MODEL, DEFAULT_MAX_TOKENS, USE_FULL_PROJECT_PROMPT
 
 stub = modal.Stub("smol-developer-v1") # yes we are recommending using Modal by default, as it helps with deployment. see readme for why.
 openai_image = modal.Image.debian_slim().pip_install("openai", "tiktoken")
@@ -60,17 +61,18 @@ def reportTokens(prompt):
 
 @stub.function()
-def generate_file(filename, model=DEFAULT_MODEL, filepaths_string=None, shared_dependencies=None, prompt=None):
+def generate_file(filename, model=DEFAULT_MODEL, filepaths_string=None, shared_dependencies=None, prompt=None, generatedFilesContent=None):
     # call openai api with this prompt
     filecode = generate_response.call(model, f"""You are an AI developer who is trying to write a program that will generate code for the user based on their intent.
-    
+
     the app is: {prompt}
 
     the files we have decided to generate are: {filepaths_string}
 
-    the shared dependencies (like filenames and variable names) we have decided on are: {shared_dependencies}
-    
+    the shared dependencies (like filenames and variable names) we have decided on are: {shared_dependencies}""" +
+    (f"already generated files are:\n {generatedFilesContent}" if (USE_FULL_PROJECT_PROMPT and generatedFilesContent) else "") +
+    f"""
     only write valid code for the given filepath and file type, and return only the code.
     do not add any other explanation, only return valid code for that file type.
     """,
@@ -97,7 +99,7 @@ def generate_file(filename, model=DEFAULT_MODEL, filepaths_string=None, shared_d
     """,
     )
 
-    return filename, filecode
+    return filename, get_code_from_string(filecode)
 
 
 @stub.local_entrypoint()
@@ -163,17 +165,33 @@ def main(prompt, directory=DEFAULT_DIR, model=DEFAULT_MODEL, file=None):
         print(shared_dependencies)
         # write shared dependencies as a md file inside the generated directory
         write_file("shared_dependencies.md", shared_dependencies, directory)
-        
+        generated_files_content = ""
         # Iterate over generated files and write them to the specified directory
         for filename, filecode in generate_file.map(
             list_actual, order_outputs=False, kwargs=dict(model=model, filepaths_string=filepaths_string, shared_dependencies=shared_dependencies, prompt=prompt)
         ):
             write_file(filename, filecode, directory)
+            generated_files_content += f"{directory}/{filename}\n"
+            generated_files_content += "\n"
+            generated_files_content += filecode
+            generated_files_content += "\n"
     except ValueError:
         print("Failed to parse result")
 
+# sometimes GPT-3.5 still returns some words around the content of the file
+# example:
+# # Makefile
+# ```makefile
+# contents
+# ```
+def get_code_from_string(input_string):
+    match = re.search(r'```[^\n]+?\n([\s\S]+?)\n```', input_string)
+    if match:
+        return match.group(1)
+    else:
+        return input_string
 
 def write_file(filename, filecode, directory):
     # Output the filename in blue color

diff --git a/main_no_modal.py b/main_no_modal.py
index b3edba516..810577c6a 100644
--- a/main_no_modal.py
+++ b/main_no_modal.py
@@ -1,9 +1,10 @@
 import sys
 import os
+import re
 import ast
 from time import sleep
 from utils import clean_dir
-from constants import DEFAULT_DIR, DEFAULT_MODEL, DEFAULT_MAX_TOKENS
+from constants import DEFAULT_DIR, DEFAULT_MODEL, DEFAULT_MAX_TOKENS, USE_FULL_PROJECT_PROMPT
 
 def generate_response(system_prompt, user_prompt, *args):
     import openai
     import tiktoken
@@ -62,7 +63,7 @@ def reportTokens(prompt):
 
 def generate_file(
-    filename, filepaths_string=None, shared_dependencies=None, prompt=None
+    filename, filepaths_string=None, shared_dependencies=None, prompt=None, generatedFilesContent=None
 ):
     # call openai api with this prompt
     filecode = generate_response(
@@ -72,8 +73,9 @@ def generate_file(
 
     the files we have decided to generate are: {filepaths_string}
 
-    the shared dependencies (like filenames and variable names) we have decided on are: {shared_dependencies}
-    
+    the shared dependencies (like filenames and variable names) we have decided on are: {shared_dependencies}""" +
+    (f"already generated files are:\n {generatedFilesContent}" if (USE_FULL_PROJECT_PROMPT and generatedFilesContent) else "") +
+    f"""
     only write valid code for the given filepath and file type, and return only the code.
     do not add any other explanation, only return valid code for that file type.
     """,
@@ -100,7 +102,7 @@ def generate_file(
     """,
     )
 
-    return filename, filecode
+    return filename, get_code_from_string(filecode)
 
 
 def main(prompt, directory=DEFAULT_DIR, file=None):
@@ -174,7 +176,7 @@ def main(prompt, directory=DEFAULT_DIR, file=None):
         print(shared_dependencies)
         # write shared dependencies as a md file inside the generated directory
         write_file("shared_dependencies.md", shared_dependencies, directory)
-        
+        generated_files_content = ""
         for name in list_actual:
             filename, filecode = generate_file(
                 name,
@@ -183,10 +185,27 @@ def main(prompt, directory=DEFAULT_DIR, file=None):
                 prompt=prompt,
             )
             write_file(filename, filecode, directory)
+            generated_files_content += f"{directory}/{filename}\n"
+            generated_files_content += "\n"
+            generated_files_content += filecode
+            generated_files_content += "\n"
+
     except ValueError:
         print("Failed to parse result: " + result)
 
+# sometimes GPT-3.5 still returns some words around the content of the file
+# example:
+# # Makefile
+# ```makefile
+# contents
+# ```
+def get_code_from_string(input_string):
+    match = re.search(r'```[^\n]+?\n([\s\S]+?)\n```', input_string)
+    if match:
+        return match.group(1)
+    else:
+        return input_string
 
 def write_file(filename, filecode, directory):
     # Output the filename in blue color
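A quick sanity check of the get_code_from_string helper introduced above (a self-contained sketch; the sample strings are invented for illustration): with the initial [^\n]+? pattern, the opening fence must carry a language tag, which is exactly the limitation that patches 4 and 5 later relax to [^\n]*?.

import re

def get_code_from_string(input_string):
    # Same regex and fallback as the helper added in patch 2.
    match = re.search(r'```[^\n]+?\n([\s\S]+?)\n```', input_string)
    if match:
        return match.group(1)
    else:
        return input_string

# Tagged fence: the surrounding words and the fences are stripped.
wrapped = "# Makefile\n```makefile\nall:\n\techo hi\n```"
assert get_code_from_string(wrapped) == "all:\n\techo hi"

# Bare fence: [^\n]+? needs at least one character before the newline,
# so nothing matches and the whole wrapper comes back unchanged.
bare = "```\nplain contents\n```"
assert get_code_from_string(bare) == bare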
From 141732335d19d1dc159e9b908c78bfced47465e1 Mon Sep 17 00:00:00 2001
From: Alexander Gusman
Date: Sat, 17 Jun 2023 11:05:23 +0200
Subject: [PATCH 3/8] fix: actually pass the generated files content to
 generate_file

---
 main.py          | 6 +++---
 main_no_modal.py | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/main.py b/main.py
index 59251e3f2..4a41a0b8e 100644
--- a/main.py
+++ b/main.py
@@ -61,7 +61,7 @@ def reportTokens(prompt):
 
 @stub.function()
-def generate_file(filename, model=DEFAULT_MODEL, filepaths_string=None, shared_dependencies=None, prompt=None, generatedFilesContent=None):
+def generate_file(filename, model=DEFAULT_MODEL, filepaths_string=None, shared_dependencies=None, prompt=None, generated_files_content=None):
     # call openai api with this prompt
     filecode = generate_response.call(model, f"""You are an AI developer who is trying to write a program that will generate code for the user based on their intent.
 
     the app is: {prompt}
 
     the files we have decided to generate are: {filepaths_string}
 
     the shared dependencies (like filenames and variable names) we have decided on are: {shared_dependencies}""" +
-    (f"already generated files are:\n {generatedFilesContent}" if (USE_FULL_PROJECT_PROMPT and generatedFilesContent) else "") +
+    (f"already generated files are:\n {generated_files_content}" if (USE_FULL_PROJECT_PROMPT and generated_files_content) else "") +
     f"""
     only write valid code for the given filepath and file type, and return only the code.
     do not add any other explanation, only return valid code for that file type.
@@ -168,7 +168,7 @@ def main(prompt, directory=DEFAULT_DIR, model=DEFAULT_MODEL, file=None):
     generated_files_content = ""
     # Iterate over generated files and write them to the specified directory
     for filename, filecode in generate_file.map(
-        list_actual, order_outputs=False, kwargs=dict(model=model, filepaths_string=filepaths_string, shared_dependencies=shared_dependencies, prompt=prompt)
+        list_actual, order_outputs=False, kwargs=dict(model=model, filepaths_string=filepaths_string, shared_dependencies=shared_dependencies, prompt=prompt, generated_files_content=generated_files_content)
     ):
         write_file(filename, filecode, directory)
         generated_files_content += f"{directory}/{filename}\n"

diff --git a/main_no_modal.py b/main_no_modal.py
index 810577c6a..bea1a30ca 100644
--- a/main_no_modal.py
+++ b/main_no_modal.py
@@ -63,7 +63,7 @@ def reportTokens(prompt):
 
 def generate_file(
-    filename, filepaths_string=None, shared_dependencies=None, prompt=None, generatedFilesContent=None
+    filename, filepaths_string=None, shared_dependencies=None, prompt=None, generated_files_content=None
 ):
     # call openai api with this prompt
     filecode = generate_response(
@@ -74,7 +74,7 @@ def generate_file(
 
     the files we have decided to generate are: {filepaths_string}
 
     the shared dependencies (like filenames and variable names) we have decided on are: {shared_dependencies}""" +
-    (f"already generated files are:\n {generatedFilesContent}" if (USE_FULL_PROJECT_PROMPT and generatedFilesContent) else "") +
+    (f"already generated files are:\n {generated_files_content}" if (USE_FULL_PROJECT_PROMPT and generated_files_content) else "") +
     f"""
     only write valid code for the given filepath and file type, and return only the code.
     do not add any other explanation, only return valid code for that file type.
@@ -183,6 +183,7 @@ def main(prompt, directory=DEFAULT_DIR, file=None):
             filepaths_string=filepaths_string,
             shared_dependencies=shared_dependencies,
             prompt=prompt,
+            generated_files_content=generated_files_content,
         )
         write_file(filename, filecode, directory)
         generated_files_content += f"{directory}/{filename}\n"
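To make the effect of patch 3 concrete, here is a minimal sketch of the sequential accumulation it wires up in main_no_modal.py (generate_file below is an invented stand-in, not the real API-calling function): every file generated later can see each file generated before it. Note that on the Modal path in main.py, the kwargs dict is built once before generate_file.map dispatches the parallel calls, so there each call still appears to receive the initial empty string.

def generate_file(name, generated_files_content=None):
    # Stand-in: just report what context this file would be generated with.
    context = generated_files_content or "(no files generated yet)"
    return name, f"// code for {name}, generated seeing:\n{context}"

generated_files_content = ""
for name in ["index.html", "style.css", "app.js"]:
    filename, filecode = generate_file(
        name, generated_files_content=generated_files_content
    )
    # Mirrors the accumulation in main_no_modal.py: path, blank line, code.
    generated_files_content += f"generated/{filename}\n\n{filecode}\n"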
From 75f2ea79004bc1b8e5d1aa493acf5df36982d81e Mon Sep 17 00:00:00 2001
From: Alexander Gusman
Date: Mon, 19 Jun 2023 09:37:34 +0300
Subject: [PATCH 4/8] fix(code_extract): should extract even from ``` blocks
 with no language tag

---
 main.py          |  2 +-
 main_no_modal.py | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/main.py b/main.py
index 4a41a0b8e..55f2d5981 100644
--- a/main.py
+++ b/main.py
@@ -187,7 +187,7 @@ def main(prompt, directory=DEFAULT_DIR, model=DEFAULT_MODEL, file=None):
 # contents
 # ```
 def get_code_from_string(input_string):
-    match = re.search(r'```[^\n]+?\n([\s\S]+?)\n```', input_string)
+    match = re.search(r'```[^\n]*?\n([\s\S]+?)\n```', input_string)
     if match:
         return match.group(1)
     else:
         return input_string

diff --git a/main_no_modal.py b/main_no_modal.py
index bea1a30ca..6ec2b2bd7 100644
--- a/main_no_modal.py
+++ b/main_no_modal.py
@@ -7,7 +7,7 @@ from constants import DEFAULT_DIR, DEFAULT_MODEL, DEFAULT_MAX_TOKENS, USE_FULL_PROJECT_PROMPT
 
 def generate_response(system_prompt, user_prompt, *args):
-    import openai
+    import genstudiopy
     import tiktoken
 
     def reportTokens(prompt):
@@ -24,7 +24,7 @@ def reportTokens(prompt):
     )
 
     # Set up your OpenAI API credentials
-    openai.api_key = os.environ["OPENAI_API_KEY"]
+    # genstudiopy.api_key = os.environ["OPENAI_API_KEY"]
 
     messages = []
     messages.append({"role": "system", "content": system_prompt})
@@ -41,7 +41,7 @@ def reportTokens(prompt):
     params = {
         "model": DEFAULT_MODEL,
         "messages": messages,
-        "max_tokens": DEFAULT_MAX_TOKENS,
+        # "max_tokens": DEFAULT_MAX_TOKENS,
         "temperature": 0,
     }
 
@@ -49,7 +49,7 @@ def reportTokens(prompt):
     keep_trying = True
     while keep_trying:
         try:
-            response = openai.ChatCompletion.create(**params)
+            response = genstudiopy.ChatCompletion.create(**params)
             keep_trying = False
         except Exception as e:
             # e.g. 
when the API is too busy, we don't want to fail everything
@@ -202,7 +202,7 @@ def main(prompt, directory=DEFAULT_DIR, file=None):
 # contents
 # ```
 def get_code_from_string(input_string):
-    match = re.search(r'```[^\n]+?\n([\s\S]+?)\n```', input_string)
+    match = re.search(r'```[^\n]*?\n([\s\S]+?)\n```', input_string)
     if match:
         return match.group(1)
     else:
         return input_string

From cd58f9f2e87e25cc16a8c105e2cfb9d062eb4a27 Mon Sep 17 00:00:00 2001
From: Alexander Gusman
Date: Mon, 19 Jun 2023 10:31:22 +0300
Subject: [PATCH 5/8] fix(code_extract): should extract even from ``` blocks
 with no language tag

---
 main.py          | 2 +-
 main_no_modal.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.py b/main.py
index 4a41a0b8e..55f2d5981 100644
--- a/main.py
+++ b/main.py
@@ -187,7 +187,7 @@ def main(prompt, directory=DEFAULT_DIR, model=DEFAULT_MODEL, file=None):
 # contents
 # ```
 def get_code_from_string(input_string):
-    match = re.search(r'```[^\n]+?\n([\s\S]+?)\n```', input_string)
+    match = re.search(r'```[^\n]*?\n([\s\S]+?)\n```', input_string)
     if match:
         return match.group(1)
     else:
         return input_string

diff --git a/main_no_modal.py b/main_no_modal.py
index bea1a30ca..d75db5026 100644
--- a/main_no_modal.py
+++ b/main_no_modal.py
@@ -202,7 +202,7 @@ def main(prompt, directory=DEFAULT_DIR, file=None):
 # contents
 # ```
 def get_code_from_string(input_string):
-    match = re.search(r'```[^\n]+?\n([\s\S]+?)\n```', input_string)
+    match = re.search(r'```[^\n]*?\n([\s\S]+?)\n```', input_string)
     if match:
         return match.group(1)
     else:
         return input_string

From 42e477b0c1bddd5cc4d3bafa0638e34035e7fac4 Mon Sep 17 00:00:00 2001
From: Alexander Gusman
Date: Wed, 21 Jun 2023 16:00:49 +0300
Subject: [PATCH 6/8] fix: if DEFAULT_MAX_TOKENS is 0, skip this field

---
 main.py          | 4 +++-
 main_no_modal.py | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/main.py b/main.py
index 55f2d5981..61f50882c 100644
--- a/main.py
+++ b/main.py
@@ -48,10 +48,12 @@ def reportTokens(prompt):
     params = {
         "model": model,
         "messages": messages,
-        "max_tokens": DEFAULT_MAX_TOKENS,
         "temperature": 0,
     }
 
+    if DEFAULT_MAX_TOKENS != 0:
+        params["max_tokens"] = DEFAULT_MAX_TOKENS
+
     # Send the API request
     response = openai.ChatCompletion.create(**params)

diff --git a/main_no_modal.py b/main_no_modal.py
index d75db5026..15df37e53 100644
--- a/main_no_modal.py
+++ b/main_no_modal.py
@@ -41,10 +41,12 @@ def reportTokens(prompt):
     params = {
         "model": DEFAULT_MODEL,
         "messages": messages,
-        "max_tokens": DEFAULT_MAX_TOKENS,
         "temperature": 0,
     }
 
+    if DEFAULT_MAX_TOKENS != 0:
+        params["max_tokens"] = DEFAULT_MAX_TOKENS
+
     # Send the API request
     keep_trying = True
     while keep_trying:

From 952060f1ffb2dffce1a9050124f4f4a60e390e91 Mon Sep 17 00:00:00 2001
From: Alexander Gusman
Date: Wed, 21 Jun 2023 17:33:14 +0300
Subject: [PATCH 7/8] fix: revert unnecessary changes

---
 main_no_modal.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/main_no_modal.py b/main_no_modal.py
index c9492a6f8..15df37e53 100644
--- a/main_no_modal.py
+++ b/main_no_modal.py
@@ -7,7 +7,7 @@ from constants import DEFAULT_DIR, DEFAULT_MODEL, DEFAULT_MAX_TOKENS, USE_FULL_PROJECT_PROMPT
 
 def generate_response(system_prompt, user_prompt, *args):
-    import genstudiopy
+    import openai
     import tiktoken
 
     def reportTokens(prompt):
@@ -24,7 +24,7 @@ def reportTokens(prompt):
     )
 
     # Set up your OpenAI API credentials
-    # genstudiopy.api_key = os.environ["OPENAI_API_KEY"]
+    openai.api_key = os.environ["OPENAI_API_KEY"]
 
     messages = []
     messages.append({"role": "system", "content": system_prompt})
@@ -51,7 +51,7 @@ def reportTokens(prompt):
     keep_trying = True
     while keep_trying:
         try:
-            response = genstudiopy.ChatCompletion.create(**params)
+            response = openai.ChatCompletion.create(**params)
             keep_trying = False
         except Exception as e:
             # e.g. when the API is too busy, we don't want to fail everything
From 22dcf975d2cdc8fa00f7e6b923fe1b6a72fe3ccf Mon Sep 17 00:00:00 2001
From: Alexander Gusman
Date: Thu, 22 Jun 2023 13:19:04 +0300
Subject: [PATCH 8/8] fix(USE_FULL_PROJECT_PROMPT): "0" or "False" should
 also count as false

---
 constants.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/constants.py b/constants.py
index baa3d9b57..2717255d9 100644
--- a/constants.py
+++ b/constants.py
@@ -4,13 +4,15 @@ DEFAULT_DIR = "generated"
 
 try:
-    USE_FULL_PROJECT_PROMPT = bool(os.environ["USE_FULL_PROJECT_PROMPT"])
+    USE_FULL_PROJECT_PROMPT = bool(os.environ["USE_FULL_PROJECT_PROMPT"]) and os.environ["USE_FULL_PROJECT_PROMPT"] != 'False' and os.environ["USE_FULL_PROJECT_PROMPT"] != "0"
 except KeyError:
     # If enabled, each file generation prompt will include all the files generated before it
     # It helps to make the code much more consistent
     # But it requires at least a 16k context model, even for a small project
     USE_FULL_PROJECT_PROMPT = False
 
+print(USE_FULL_PROJECT_PROMPT)
+
 # https://platform.openai.com/docs/models/gpt-4
 try:
     DEFAULT_MODEL = os.environ["OPENAI_DEFAULT_MODEL"]
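A closing observation on the flag parsing that patch 8 arrives at: it grows one condition per falsy spelling. A possible consolidation with the same semantics is sketched below (env_flag is a hypothetical helper, not part of this series): unset falls back to the default, and "", "0", and "False" all disable the flag.

import os

def env_flag(name: str, default: bool = False) -> bool:
    # Unset -> default; "", "0", "False" -> False; anything else -> True.
    value = os.environ.get(name)
    if value is None:
        return default
    return value not in ("", "0", "False")

USE_FULL_PROJECT_PROMPT = env_flag("USE_FULL_PROJECT_PROMPT")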