From 3169c5acd1963936e33930d234390a09bcf8ebd3 Mon Sep 17 00:00:00 2001
From: innovation64
Date: Sat, 17 Feb 2024 22:24:13 +0800
Subject: [PATCH 1/3] fix: delete the README generation module

---
 repo_agent/readme_generator/config.yml     |   4 -
 repo_agent/readme_generator/dataacq.py     |  79 ------------
 repo_agent/readme_generator/genterated.py  |  46 -------
 repo_agent/readme_generator/readmegen.py   |  80 ------------
 repo_agent/readme_generator/runner.py      |  19 ---
 repo_agent/readme_generator/summary.py     | 135 ---------------------
 repo_agent/readme_generator/template.yml   |  98 ---------------
 repo_agent/readme_generator/userappend.yml |   2 -
 8 files changed, 463 deletions(-)
 delete mode 100644 repo_agent/readme_generator/config.yml
 delete mode 100644 repo_agent/readme_generator/dataacq.py
 delete mode 100644 repo_agent/readme_generator/genterated.py
 delete mode 100644 repo_agent/readme_generator/readmegen.py
 delete mode 100644 repo_agent/readme_generator/runner.py
 delete mode 100644 repo_agent/readme_generator/summary.py
 delete mode 100644 repo_agent/readme_generator/template.yml
 delete mode 100644 repo_agent/readme_generator/userappend.yml

diff --git a/repo_agent/readme_generator/config.yml b/repo_agent/readme_generator/config.yml
deleted file mode 100644
index d024116..0000000
--- a/repo_agent/readme_generator/config.yml
+++ /dev/null
@@ -1,4 +0,0 @@
-repo : ""
-api_key : ""
-api_base : ""
-markdownrepo : ""
\ No newline at end of file
diff --git a/repo_agent/readme_generator/dataacq.py b/repo_agent/readme_generator/dataacq.py
deleted file mode 100644
index af62c44..0000000
--- a/repo_agent/readme_generator/dataacq.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import os
-import git
-
-class DataAquire:
-    def __init__(self, directory):
-        self.directory = directory
-
-    def build_tree(self, files):
-        tree = {}
-        for file in files:
-            parts = file.split('/')
-            node = tree
-            for part in parts:
-                if part not in node:
-                    node[part] = {}
-                node = node[part]
-        return tree
-
-    def build_tree_string(self, node, prefix=''):
-        tree_str = ''
-        if not isinstance(node, dict) or not node:
-            return tree_str
-
-        last_key = list(node.keys())[-1]
-        for key in node:
-            connector = '└── ' if key == last_key else '├── '
-            tree_str += prefix + connector + key + '\n'
-            if isinstance(node[key], dict):
-                extension = '    ' if key == last_key else '│   '
-                tree_str += self.build_tree_string(node[key], prefix=prefix + extension)
-        return tree_str
-
-    def extract_requirements(self):
-        path = os.path.join(self.directory, 'requirements.txt')
-        if os.path.isfile(path):
-            with open(path, 'r', encoding="utf-8") as file:
-                return file.read()
-        return "requirements.txt not found"
-
-    def extract_config_guide(self):
-        path = os.path.join(self.directory, 'config.yml')
-        if os.path.isfile(path):
-            with open(path, 'r', encoding="utf-8") as file:
-                return file.read()
-        return "config.yml not found"
-
-    def extract_license(self):
-        path = os.path.join(self.directory, 'LICENSE')
-        if os.path.isfile(path):
-            with open(path, 'r', encoding="utf-8") as file:
-                lines = file.readlines()
-                # Keep only the first 3 lines
-                useful_lines = lines[:3]
-                return ''.join(useful_lines)
-        return "LICENSE file not found"
-
-    def top5(self):
-        repo = git.Repo(self.directory)
-        info = {}
-        info['commits'] = [{'commit': commit.hexsha, 'message': commit.message.strip()} for commit in repo.iter_commits()]
-        top_five_messages = [msg['message'] for msg in info['commits'][:5]]
-        return top_five_messages
-
-    def summaryinfo(self):
-        with open('./compressmd/summary.md', 'r', encoding="utf-8") as f:
-            summary = f.read()
-        return summary
-
-    def tree(self):
-        repo = git.Repo(self.directory)
-        files = [item.path for item in repo.tree().traverse()]
-        tree_string = self.build_tree_string(self.build_tree(files))
-        return tree_string
-
-if __name__ == '__main__':
-    path = "../../"
-    data = DataAquire(path)
-    print(data.summaryinfo())
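The `build_tree`/`build_tree_string` pair being deleted here is a conventional trie-and-render pattern: paths are folded into nested dicts, then walked depth-first with `├──`/`└──` connectors. A minimal standalone sketch of the same idea (the input list is hypothetical, not data from this repo):

```python
def build_tree(paths):
    # Fold 'a/b/c'-style paths into nested dicts: {'a': {'b': {'c': {}}}}.
    tree = {}
    for path in paths:
        node = tree
        for part in path.split('/'):
            node = node.setdefault(part, {})
    return tree

def render(node, prefix=''):
    lines, keys = [], list(node)
    for i, key in enumerate(keys):
        last = i == len(keys) - 1
        lines.append(prefix + ('└── ' if last else '├── ') + key)
        # Children are indented under their parent's connector column.
        lines.extend(render(node[key], prefix + ('    ' if last else '│   ')))
    return lines

print('\n'.join(render(build_tree(['src/a.py', 'src/b.py', 'README.md']))))
```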
diff --git a/repo_agent/readme_generator/genterated.py b/repo_agent/readme_generator/genterated.py
deleted file mode 100644
index 2e431a5..0000000
--- a/repo_agent/readme_generator/genterated.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from llama_index.llms import OpenAI
-
-
-class OpenChatSummary:
-    def __init__(self, api_key, api_base):
-        self.llm = OpenAI(api_key=api_key, api_base=api_base)
-        self.client = OpenAI(api_key=api_key, api_base=api_base, model="gpt-3.5-turbo-1106")
-
-    def processname(self, value, structure, template):
-        information = "\n\n".join(value)
-        messages = f"Based on the following information: {information}, and the repo structure {structure}, fill the template: {template}. The output must be in markdown format and contain only the program name, no more than 15 words."
-        response = self.llm.complete(messages)
-        content = response
-        return content
-
-    def processdes(self, value, structure, template):
-        # Step 1: Create an overview
-        overview = f"This document will provide an overview of the project, including its structure and the template requirements. The main aspects are as follows: {', '.join(value)}."
-        # Step 2: Detailed description
-        detailed_description = "\n\n".join([f"**{item}:** Detailed information about '{item}'." for item in value])
-        # Step 3: Structure and template information
-        structure_and_template = f"The project's structure is as follows: {structure}. The template to be filled is: {template}."
-        # Step 4: Conclusion
-        conclusion = "In summary, the above details provide a comprehensive view of the project, its structure, and template requirements."
-        # Combine all parts
-        messages = f"{overview}\n\n{detailed_description}\n\n{structure_and_template}\n\n{conclusion}\n\nNote: The response must be in markdown format and should not exceed 350 words."
-        # Get the response from the language model
-        response = self.llm.complete(messages)
-        content = response
-        return content
-
-    def processlinces(self, value, template):
-        information = "\n\n".join(value)
-        messages = f"Based on the following information: {information}, fill the template: {template}. The output must be in markdown format and strictly adhere to the template style. The license URL has the form https://choosealicense.com/licenses/XXXX, where XXXX is the license name."
-        response = self.llm.complete(messages)
-        content = response
-        return content
-
-    def processother(self, value, structure, template):
-        information = "\n\n".join(value)
-        messages = f"Based on the following information: {information}, and the repo structure {structure}, fill the template: {template}. The output must be in markdown format, no more than 450 words, and strictly adhere to the template style."
-        response = self.llm.complete(messages)
-        content = response
-        return content
-
-    def refactor(self, value):
-        information = "\n\n".join(value)
-        messages = f"Reconstruct the new README based on the existing README, remove the useless information in it, and rearrange the format. The output must be in the markdown format of README.md. Here is the existing README: {information}"
-        response = self.client.complete(messages)
-        content = response
-        return content
\ No newline at end of file
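Each `process*` method above repeats one prompt shape: gathered facts, plus the repo tree, plus the section template, capped at a word limit. A runnable sketch of that shape; the `complete` callable is an offline stand-in for the LLM client, not the llama-index API:

```python
def fill_section(complete, information, structure, template, word_limit=450):
    # One prompt per README section: context, repo tree, then the template to fill.
    prompt = (
        f"Based on the following information: {information}\n"
        f"Repo structure:\n{structure}\n"
        f"Fill this template in markdown, in under {word_limit} words, "
        f"strictly keeping the template's style:\n{template}"
    )
    return complete(prompt)

# Echo stub standing in for a model call, so the sketch runs offline.
stub = lambda prompt: "## Installation\npip install -r requirements.txt"
print(fill_section(stub, "uses pip", "repo/\n└── setup.py", "## Installation"))
```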
diff --git a/repo_agent/readme_generator/readmegen.py b/repo_agent/readme_generator/readmegen.py
deleted file mode 100644
index 09bcad0..0000000
--- a/repo_agent/readme_generator/readmegen.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import yaml
-from genterated import OpenChatSummary
-from dataacq import DataAquire
-from llama_index.llms import OpenAI
-from loguru import logger
-
-logger.add("./log.txt", level="DEBUG", format="{time} - {name} - {level} - {message}")
-
-
-def read_yaml(file_path):
-    """
-    Read a YAML file and return the data.
-    """
-    with open(file_path, 'r', encoding='utf-8') as file:
-        return yaml.safe_load(file)
-
-def create_readme(yaml_data, output_file):
-    """
-    Create a README file from YAML data.
-    """
-    readme_content = ""
-
-    # Walk the YAML data and convert each key-value pair to markdown
-    for key, value in yaml_data.items():
-        appendix_content = read_yaml("./userappend.yml")["appendix"]
-        appendix_content = "This is additional information " + appendix_content
-        if key.lower() == 'name':
-            info = str(data.summaryinfo()) + appendix_content
-            valuea = clients.processname(info, str(data.tree()), value)
-            logger.debug(f"Name: {info}")
-            # logger.debug(f"After loop: {str(data.tree())}")
-            readme_content += f"{valuea}\n\n"
-            # logger.debug(f"After loop: {readme_content}")
-        if key.lower() == 'description':
-            info = str(data.summaryinfo()) + appendix_content
-            logger.debug(f"Description: {info}")
-            valuea = clients.processdes(info, str(data.tree()), value)
-            readme_content += f">{valuea}\n\n"
-        if key.lower() == 'license':
-            info = str(data.extract_license()) + appendix_content
-            valuea = clients.processlinces(info, value)
-            logger.debug(f"License: {valuea}")
-            readme_content += f"{valuea}\n\n"
-        if key.lower() == 'installation':
-            info = str(data.extract_requirements()) + appendix_content
-            valuea = clients.processother(info, str(data.tree()), value)
-            readme_content += f"{valuea}\n\n"
-        if key.lower() == 'usage':
-            info = str(data.extract_config_guide()) + str(data.summaryinfo()) + appendix_content
-            valuea = clients.processother(info, str(data.tree()), value)
-            readme_content += f"{valuea}\n\n"
-        appendix_functions = ['badges', 'visuals', 'support', 'roadmap', 'authors_and_acknowledgment', 'project_status', 'citation']
-        for section in appendix_functions:
-            if section in appendix_content:
-                valuea = clients.processother(appendix_content, str(data.tree()), value)
-                readme_content += f"{valuea}\n\n"
-                # Call the corresponding function
-    # Write the generated content to the README file
-    logger.debug(f"Before refactor: {readme_content}")
-    readme_content = clients.refactor(readme_content)
-    logger.debug(f"After refactor: {readme_content}")
-    readme_content = str(readme_content)
-    with open(output_file, "w", encoding="utf-8") as file:
-        file.write(readme_content)
-
-    print(f"{output_file} file created successfully.")
-
-
-if __name__ == "__main__":
-    config = read_yaml("config.yml")
-    api_key = config['api_key']
-    api_base = config['api_base']
-    path = config['repo']
-    clients = OpenChatSummary(api_key, api_base)
-    data = DataAquire(path)
-    # Read the YAML template
-    yaml_data = read_yaml("./template.yml")
-
-    # Create the README file
-    create_readme(yaml_data, "README.md")
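`create_readme` is essentially a dispatch from template keys to data sources. The skeleton below condenses that dispatch to its core; the inline template and stub sources are hypothetical, chosen only so the sketch runs without the deleted modules:

```python
import yaml

template = yaml.safe_load('{name: "# {title}", installation: "## Installation\\n{req}"}')

# Map each template key to a callable that gathers its source material.
sources = {
    'name': lambda: {'title': 'repo-agent'},
    'installation': lambda: {'req': 'pip install -r requirements.txt'},
}

readme = []
for key, section in template.items():
    readme.append(section.format(**sources[key.lower()]()))
print('\n\n'.join(readme))
```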
"readmegen.py" - -# 运行第一个 Python 文件 -result1 = subprocess.run(["python", file1], capture_output=True, text=True, check=True) - -# 打印第一个文件的输出 -print("Output of summary.py:") -print(result1.stdout) - -# 运行第二个 Python 文件 -result2 = subprocess.run(["python", file2], capture_output=True, text=True, check=True) - -# 打印第二个文件的输出 -print("\nOutput of readmegen.py:") -print(result2.stdout) diff --git a/repo_agent/readme_generator/summary.py b/repo_agent/readme_generator/summary.py deleted file mode 100644 index d6c652d..0000000 --- a/repo_agent/readme_generator/summary.py +++ /dev/null @@ -1,135 +0,0 @@ -import os -import openai -from openai import OpenAI -from loguru import logger -import yaml - -logger.add("./log.txt", level="DEBUG", format="{time} - {name} - {level} - {message}") - - -def read_yaml(file_path): - """ - Read a YAML file and return the data. - """ - with open(file_path, 'r', encoding='utf-8') as file: - return yaml.safe_load(file) - -def generate_summary(text): - - prompt="Remember I noly need the descripions about program, the usage info, please Summarize the following text:\n\n" + text - completion = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "system", "content": prompt} - - ] - ) - - return completion.choices[0].message.content - -def process_content(original_content): - # 这里可以添加您自己的内容处理逻辑 - # 现在仅仅是复制原内容 - prompt = """ - Extract key information from the provided functional description for use in writing a README file. Pay attention to the following points: - - 1. **Name of class or library**: Identify the name of the class or library mentioned in the description. - 2. **Main functions**: Summarize the core functions of the class or library, that is, what it is used for. - 3. **Initialization method** (if mentioned): Extracts information about the instance of the initialized class, such as the constructor and its parameters. - 4. **Dependencies and Third-Party Libraries**: Pay attention to any dependencies or third-party libraries mentioned in the description, this is very important for setup and installation. - 5. **Main methods and operations**: - - Determine the name of each major method or operation. - - Extract the purpose, input parameters, and return value of each method. - 6. **Sample Usage** (if available): Look for any code examples or usage instructions that may be included in the description. - 7. **Installation and Configuration Requirements** (if mentioned): Pay attention to any instructions on how to install or configure a class or library. - 8. **Additional Resources** (if mentioned): Include links to additional documentation, community forums, or other relevant resources. - - The focus is on extracting the precision and usefulness of this information in order to integrate it into a structured and informative README document. - Here is an output example: - ' - ClassDef XXX: - - Function description: XXX - Initialization function __init__: receives the code library path repo_path, creates and saves the git warehouse object as the repo attribute. - Third-party libraries used: git, subprocess, re. - Method get_staged_pys: - - Function: Get the staged Python file changes in the warehouse. - Return: Returns a dictionary, the key is the file path, and the value is a Boolean value indicating whether the file is newly created. - Method get_changed_pys: - - Function: Get the changed Python files in the warehouse, including unstaged changes and untracked files. - Returns: Returns a dictionary with the same structure as get_staged_pys. 
diff --git a/repo_agent/readme_generator/template.yml b/repo_agent/readme_generator/template.yml
deleted file mode 100644
index 6ccdb0f..0000000
--- a/repo_agent/readme_generator/template.yml
+++ /dev/null
@@ -1,98 +0,0 @@
-
-name : |
-  # Name
-
-
-Description : |
-
-  >A brief description of what this project does and who it's for...
-
-
-Badges : |
-
-  ## Badges
-
-  Add badges from somewhere like: [shields.io](https://shields.io/)
-
-  [![MIT License](https://img.shields.io/badge/License-MIT-green.svg)](https://choosealicense.com/licenses/mit/)
-  [![GPLv3 License](https://img.shields.io/badge/License-GPL%20v3-yellow.svg)](https://opensource.org/licenses/)
-  [![AGPL License](https://img.shields.io/badge/license-AGPL-blue.svg)](http://www.gnu.org/licenses/agpl-3.0)
-
-
-Visuals : |
-  ## Visuals
-
-  Add images from somewhere like: [imgur.com](https://imgur.com/)
-
-
-Installation : |
-
-  ## Installation
-
-  Install my-project with npm
-
-  ```bash
-  npm install my-project
-  cd my-project
-  ```
-
-Usage : |
-
-  ## Usage/Examples
-
-  ```javascript
-  import Component from 'my-project'
-
-  function App() {
-    return <Component />
-  }
-  ```
-
-
-Support : |
-
-  ## Support
-
-  For support, email fake@fake.com or join our Slack channel.
-
-
-Roadmap : |
-
-
-Contributing : |
-
-
-Authors_and_acknowledgment : |
-  ## Authors
-
-  - [@octokatherine](https://www.github.com/octokatherine)
-
-
-License : |
-
-  ## License
-
-  [MIT](https://choosealicense.com/licenses/mit/)
-
-
-Project_status : |
-
-
-citation : |
-  ## citation
-  @article{zeng2022glm,
-    title={Glm-130b: An open bilingual pre-trained model},
-    author={Zeng, Aohan and Liu, Xiao and Du, Zhengxiao and Wang, Zihan and Lai, Hanyu and Ding, Ming and Yang, Zhuoyi and Xu, Yifan and Zheng, Wendi and Xia, Xiao and others},
-    journal={arXiv preprint arXiv:2210.02414},
-    year={2022}
-  }
\ No newline at end of file
diff --git a/repo_agent/readme_generator/userappend.yml b/repo_agent/readme_generator/userappend.yml
deleted file mode 100644
index 35e121e..0000000
--- a/repo_agent/readme_generator/userappend.yml
+++ /dev/null
@@ -1,2 +0,0 @@
-appendix : |
-  no relevant info

From 8219f0d4b31af376f2acf44660b9d102f9b04c0c Mon Sep 17 00:00:00 2001
From: innovation64
Date: Sun, 18 Feb 2024 15:24:37 +0800
Subject: [PATCH 2/3] fix: fix the RAG dropout problem, reduce redundancy, and
 add a rerank step

---
 repo_agent/chat_with_repo/gradio_interface.py | 221 ++++++------------
 repo_agent/chat_with_repo/json_handler.py     |  88 +++----
 repo_agent/chat_with_repo/main.py             |   5 +-
 repo_agent/chat_with_repo/prompt.py           |  11 +-
 repo_agent/chat_with_repo/rag.py              |  70 ++++--
 repo_agent/chat_with_repo/vectordb.py         |  50 ++--
 6 files changed, 204 insertions(+), 241 deletions(-)

diff --git a/repo_agent/chat_with_repo/gradio_interface.py b/repo_agent/chat_with_repo/gradio_interface.py
index c4f991d..7a43ec6 100644
--- a/repo_agent/chat_with_repo/gradio_interface.py
+++ b/repo_agent/chat_with_repo/gradio_interface.py
@@ -2,173 +2,106 @@
 import markdown
 from repo_agent.log import logger
 
+
 class GradioInterface:
     def __init__(self, respond_function):
         self.respond = respond_function
-        self.cssa = """
-
-
-        """
-        self.cssb = """
-
-
-
-
-        """
         self.setup_gradio_interface()
 
     def wrapper_respond(self, msg_input, system_input):
         # Call the original respond function
-        msg, output1, output2, output3, code, codex = self.respond(msg_input, system_input)
+        msg, output1, output2, output3, code = self.respond(msg_input, system_input)
         output1 = markdown.markdown(str(output1))
         output2 = markdown.markdown(str(output2))
         code = markdown.markdown(str(code))
         output1 = (
-            self.cssa
-            + """
-
Response
-
-
- """ - + str(output1) - +""" -
-
- - """ - ) + "
" + + str(output1) + + "
" + ) output2 = ( - self.cssa - +""" -
Embedding Recall
-
-
- """ - + str(output2) - + self.cssb - ) - code= ( - self.cssa - +""" -
Code
-
-
- """ - + str(code) - +self.cssb - ) + "
" + + str(output2) + + "
" + ) + code = ( + "
" + + str(code) + + "
" + ) - - return msg, output1, output2, output3, code, codex - def clean(self): - msg ="" - output1 =gr.HTML(self.cssa - +""" -
Response
-
-
- - """+self.cssb) - output2 =gr.HTML(self.cssa - +""" -
Embedding Recall
-
-
- - """+self.cssb) - output3 ="" - code =gr.HTML(self.cssa - +""" -
Code
-
-
- - """+self.cssb) - codex = "" - return msg, output1, output2, output3, code, codex + return msg, output1, output2, output3, code def setup_gradio_interface(self): with gr.Blocks() as demo: - gr.Markdown(""" - # RepoAgent: Chat with doc - """) - with gr.Tab("main chat"): + gr.Markdown( + """ + # RepoAgent: Chat with doc + """ + ) + with gr.Row(): + with gr.Column(scale=2): + msg = gr.Textbox(label="Question Input") + btn = gr.Button("Submit") - with gr.Row(): - with gr.Column(): - msg = gr.Textbox(label = "Question Input",lines = 4) - system = gr.Textbox(label = "(Optional)insturction editing", lines = 4) - btn = gr.Button("Submit") - btnc = gr.ClearButton() - btnr = gr.Button("record") - - output1 = gr.HTML(self.cssa - +""" -
Response
-
-
- - """+self.cssb) - with gr.Row(): - with gr.Column(): - # output2 = gr.Textbox(label = "Embedding recall") - output2 = gr.HTML(self.cssa - +""" -
Embedding Recall
-
-
- - """+self.cssb) - code = gr.HTML(self.cssa - +""" -
Code
-
-
- - """+self.cssb) - with gr.Row(): - with gr.Column(): - output3 = gr.Textbox(label = "key words",lines=2) - output4 = gr.Textbox(label = "key words code",lines=14) - - btn.click(self.wrapper_respond, inputs = [msg, system], outputs = [msg, output1, output2, output3, code,output4]) - btnc.click(self.clean,outputs= [msg, output1, output2, output3, code,output4]) - msg.submit(self.wrapper_respond, inputs = [msg, system], outputs = [msg, output1, output2, output3, code,output4]) # Press enter to submit + with gr.Accordion(label="Advanced options", open=False): + system = gr.Textbox( + label="System message", + lines=2, + value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.", + ) + gr.Markdown("## Response") + output1 = gr.HTML( + """ +
+ """ + ) + + with gr.Column(scale=1): + # output2 = gr.Textbox(label = "Embedding recall") + gr.Markdown("## Embedding Recall") + output2 = gr.HTML( + """ +
+ """ + ) + with gr.Column(scale=1): + output3 = gr.Textbox(label="key words") + gr.Markdown("## Code") + code = gr.HTML( + """ +
+
+                """
+            )
+
+            btn.click(
+                self.wrapper_respond,
+                inputs=[msg, system],
+                outputs=[msg, output1, output2, output3, code],
+            )
+            msg.submit(
+                self.wrapper_respond,
+                inputs=[msg, system],
+                outputs=[msg, output1, output2, output3, code],
+            )  # Press enter to submit
 
         gr.close_all()
-        demo.queue().launch(share=True, height=800)
+        logger.success(f"Starting Gradio Server.", enqueue=True)
+        demo.queue().launch()
+
 
 # Usage
 if __name__ == "__main__":
+
     def respond_function(msg, system):
         # Implement your response logic here
-        RAG = """
-
-
-        """
-        return msg, RAG, "Embedding_recall_output", "Key_words_output", "Code_output"
+        return (
+            msg,
+            "RAG_output",
+            "Embedding_recall_output",
+            "Key_words_output",
+            "Code_output",
+            "QA_output",
+        )
 
-    gradio_interface = GradioInterface(respond_function)
+    gradio_interface = GradioInterface(respond_function)
diff --git a/repo_agent/chat_with_repo/json_handler.py b/repo_agent/chat_with_repo/json_handler.py
index bc8af4c..cfe97aa 100644
--- a/repo_agent/chat_with_repo/json_handler.py
+++ b/repo_agent/chat_with_repo/json_handler.py
@@ -2,25 +2,25 @@
 import sys
 from repo_agent.log import logger
 
+
 class JsonFileProcessor:
     def __init__(self, file_path):
         self.file_path = file_path
 
     def read_json_file(self):
-        # Read the JSON file that serves as the database
-        with open(self.file_path, 'r', encoding='utf-8') as file:
-            data = json.load(file)
-        return data
-    def extract_md_contents(self):
-        """
-        Extracts the contents of 'md_content' from a JSON file.
+        try:
+            with open(self.file_path, "r", encoding="utf-8") as file:
+                data = json.load(file)
+                return data
+        except FileNotFoundError:
+            logger.exception(f"File not found: {self.file_path}")
+            sys.exit(1)
 
-        Returns:
-        A list of strings representing the contents of 'md_content'.
-        """
+    def extract_data(self):
         # Load JSON data from a file
         json_data = self.read_json_file()
         md_contents = []
+        extracted_contents = []
         # Iterate through each file in the JSON data
         for file, items in json_data.items():
             # Check if the value is a list (new format)
             if isinstance(items, list):
                 # Iterate through each item in the list
                 for item in items:
                     # Check if 'md_content' exists and is not empty
                     if "md_content" in item and item["md_content"]:
                         # Append the first element of 'md_content' to the result list
                         md_contents.append(item["md_content"][0])
-        return md_contents
-
-    def extract_metadata(self):
-        """
-        Extracts metadata from JSON data.
-
-        Returns:
-        A list of dictionaries containing the extracted metadata.
-        """
-        # Load JSON data from a file
-        json_data = self.read_json_file()
-        extracted_contents = []
-        # Iterate through each file in the JSON data
-        for file_name, items in json_data.items():
-            # Check if the value is a list (new format)
-            if isinstance(items, list):
-                # Iterate through each item in the list
-                for item in items:
-                    # Build a dictionary containing the required information
-                    item_dict = {
-                        "type": item.get("type", "UnknownType"),
-                        "name": item.get("name", "Unnamed"),
-                        "code_start_line": item.get("code_start_line", -1),
-                        "code_end_line": item.get("code_end_line", -1),
-                        "have_return": item.get("have_return", False),
-                        "code_content": item.get("code_content", "NoContent"),
-                        "name_column": item.get("name_column", 0),
-                        "item_status": item.get("item_status", "UnknownStatus"),
-                        # Adapt or remove fields based on new structure requirements
-                    }
-                    extracted_contents.append(item_dict)
-        return extracted_contents
+                    # Build a dictionary containing the required information
+                    item_dict = {
+                        "type": item.get("type", "UnknownType"),
+                        "name": item.get("name", "Unnamed"),
+                        "code_start_line": item.get("code_start_line", -1),
+                        "code_end_line": item.get("code_end_line", -1),
+                        "have_return": item.get("have_return", False),
+                        "code_content": item.get("code_content", "NoContent"),
+                        "name_column": item.get("name_column", 0),
+                        "item_status": item.get("item_status", "UnknownStatus"),
+                        # Adapt or remove fields based on new structure requirements
+                    }
+                    extracted_contents.append(item_dict)
+        return md_contents, extracted_contents
 
-    def recursive_search(self, data_item, search_text, results):
+    def recursive_search(self, data_item, search_text, code_results, md_results):
         if isinstance(data_item, dict):
             # Direct comparison is removed as there's no direct key==search_text in the new format
             for key, value in data_item.items():
                 # Recursively search through dictionary values and lists
                 if isinstance(value, (dict, list)):
-                    self.recursive_search(value, search_text, results)
+                    self.recursive_search(value, search_text, code_results, md_results)
         elif isinstance(data_item, list):
             for item in data_item:
                 # Now we check for the 'name' key in each item of the list
                 if isinstance(item, dict) and item.get('name') == search_text:
                     # If 'code_content' exists, append it to results
                     if 'code_content' in item:
-                        results.append(item['code_content'])
+                        code_results.append(item['code_content'])
+                        md_results.append(item['md_content'])
                 # Recursive call in case of nested lists or dicts
-                self.recursive_search(item, search_text, results)
+                self.recursive_search(item, search_text, code_results, md_results)
 
     def search_code_contents_by_name(self, file_path, search_text):
         # Attempt to retrieve code from the JSON file
         try:
             with open(file_path, 'r', encoding='utf-8') as file:
                 data = json.load(file)
-                results = []  # List to store matching items' code_content
-                self.recursive_search(data, search_text, results)
-                return results if results else "No matching item found."
+                code_results = []
+                md_results = []  # Lists to store matching items' code_content and md_content
+                self.recursive_search(data, search_text, code_results, md_results)
+                # Make sure two values are always returned, whatever the outcome
+                if code_results or md_results:
+                    return code_results, md_results
+                else:
+                    return ["No matching item found."], ["No matching item found."]
         except FileNotFoundError:
             return "File not found."
         except json.JSONDecodeError:
@@ -99,4 +87,4 @@ def search_code_contents_by_name(self, file_path, search_text):
 
 if __name__ == "__main__":
     processor = JsonFileProcessor("database.json")
-    md_contents = processor.extract_md_contents()
\ No newline at end of file
+    md_contents = processor.extract_md_contents()
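`recursive_search` is a plain depth-first walk over arbitrarily nested dict/list JSON that accumulates matches into caller-owned lists. A self-contained version of the same traversal (the toy data is illustrative, not the real project hierarchy file):

```python
def find_by_name(node, name, code_out, md_out):
    # Depth-first walk over nested dicts/lists, collecting matching items' fields.
    if isinstance(node, dict):
        for value in node.values():
            if isinstance(value, (dict, list)):
                find_by_name(value, name, code_out, md_out)
    elif isinstance(node, list):
        for item in node:
            if isinstance(item, dict) and item.get('name') == name:
                code_out.append(item.get('code_content'))
                md_out.append(item.get('md_content'))
            if isinstance(item, (dict, list)):
                find_by_name(item, name, code_out, md_out)

data = {"a.py": [{"name": "foo", "code_content": "def foo(): ...", "md_content": ["doc"]}]}
code, md = [], []
find_by_name(data, "foo", code, md)
print(code, md)  # ['def foo(): ...'] [['doc']]
```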
diff --git a/repo_agent/chat_with_repo/main.py b/repo_agent/chat_with_repo/main.py
index 39c13fd..abdfb7b 100644
--- a/repo_agent/chat_with_repo/main.py
+++ b/repo_agent/chat_with_repo/main.py
@@ -9,12 +9,11 @@ def main():
     api_key = CONFIG["api_keys"][_model][0]["api_key"]
     api_base = CONFIG["api_keys"][_model][0]["base_url"]
     db_path = os.path.join(
-        CONFIG["repo_path"], CONFIG["project_hierarchy"], ".project_hierarchy.json"
+        CONFIG["repo_path"], CONFIG["project_hierarchy"], "project_hierarchy.json"
     )
 
     assistant = RepoAssistant(api_key, api_base, db_path)
-    md_contents = assistant.json_data.extract_md_contents()
-    meta_data = assistant.json_data.extract_metadata()
+    md_contents, meta_data = assistant.json_data.extract_data()
     assistant.chroma_data.create_vector_store(md_contents, meta_data)
     GradioInterface(assistant.respond)
""" - query = f"{instruction}\nExtract the most relevant class or function from the following input:\n{message}\nOutput:" + query = f"Extract the most relevant class or function base following instrcution {query1},here is input:\n{message}\nOutput:" response = self.llm.complete(query) # logger.debug(f"Input: {message}, Output: {response}") return response @@ -56,4 +55,4 @@ def nerquery(self, message): log_file = "your_logfile_path" llm = OpenAI(api_key=api_key, api_base=api_base) db_path = "your_database_path" - test = TextAnalysisTool(llm, db_path) \ No newline at end of file + test = TextAnalysisTool(llm, db_path) diff --git a/repo_agent/chat_with_repo/rag.py b/repo_agent/chat_with_repo/rag.py index 32ae9a8..a8d109d 100644 --- a/repo_agent/chat_with_repo/rag.py +++ b/repo_agent/chat_with_repo/rag.py @@ -4,8 +4,9 @@ from repo_agent.log import logger from llama_index import PromptTemplate from llama_index.llms import OpenAI +import json +from openai import OpenAI as AI -# logger.add("./log.txt", level="DEBUG", format="{time} - {name} - {level} - {message}") class RepoAssistant: def __init__(self, api_key, api_base, db_path): @@ -16,6 +17,7 @@ def __init__(self, api_key, api_base, db_path): self.md_contents = [] self.llm = OpenAI(api_key=api_key, api_base=api_base,model="gpt-3.5-turbo-1106") self.client = OpenAI(api_key=api_key, api_base=api_base,model="gpt-4-1106-preview") + self.lm = AI(api_key = api_key, base_url = api_base) self.textanslys = TextAnalysisTool(self.llm,db_path) self.json_data = JsonFileProcessor(db_path) self.chroma_data = ChromaManager(api_key, api_base) @@ -37,10 +39,26 @@ def generate_queries(self, query_str: str, num_queries: int = 4): queries = response.text.split("\n") return queries + def rerank(self, query ,docs): + response = self.lm.chat.completions.create( + model='gpt-4-1106-preview', + response_format={"type": "json_object"}, + temperature=0, + messages=[ + {"role": "system", "content": "You are an expert relevance ranker. Given a list of documents and a query, your job is to determine how relevant each document is for answering the query. Your output is JSON, which is a list of documents. Each document has two fields, content and score. relevance_score is from 0.0 to 100.0. Higher relevance means higher score."}, + {"role": "user", "content": f"Query: {query} Docs: {docs}"} + ] + ) + scores = json.loads(response.choices[0].message.content)["documents"] + logger.debug(f"scores: {scores}") + sorted_data = sorted(scores, key=lambda x: x['relevance_score'], reverse=True) + top_5_contents = [doc['content'] for doc in sorted_data[:5]] + return top_5_contents + def rag(self, query, retrieved_documents): # rag information = "\n\n".join(retrieved_documents) - messages = f"You are a helpful expert repo research assistant. Your users are asking questions about information contained in a repository. You will be shown the user's question, and the relevant information from the repository. Answer the user's question using only the information given.\nQuestion: {query}. \nInformation: {information}" + messages = f"You are a helpful expert repo research assistant. Your users are asking questions about information contained in repo . You will be shown the user's question, and the relevant information from the repo. Answer the user's question using only this information.\nQuestion: {query}. 
diff --git a/repo_agent/chat_with_repo/rag.py b/repo_agent/chat_with_repo/rag.py
index 32ae9a8..a8d109d 100644
--- a/repo_agent/chat_with_repo/rag.py
+++ b/repo_agent/chat_with_repo/rag.py
@@ -4,8 +4,9 @@
 from repo_agent.log import logger
 from llama_index import PromptTemplate
 from llama_index.llms import OpenAI
+import json
+from openai import OpenAI as AI
 
-# logger.add("./log.txt", level="DEBUG", format="{time} - {name} - {level} - {message}")
 
 class RepoAssistant:
     def __init__(self, api_key, api_base, db_path):
@@ -16,6 +17,7 @@ def __init__(self, api_key, api_base, db_path):
         self.md_contents = []
         self.llm = OpenAI(api_key=api_key, api_base=api_base, model="gpt-3.5-turbo-1106")
         self.client = OpenAI(api_key=api_key, api_base=api_base, model="gpt-4-1106-preview")
+        self.lm = AI(api_key=api_key, base_url=api_base)
         self.textanslys = TextAnalysisTool(self.llm, db_path)
         self.json_data = JsonFileProcessor(db_path)
         self.chroma_data = ChromaManager(api_key, api_base)
@@ -37,10 +39,26 @@ def generate_queries(self, query_str: str, num_queries: int = 4):
         queries = response.text.split("\n")
         return queries
 
+    def rerank(self, query, docs):
+        response = self.lm.chat.completions.create(
+            model='gpt-4-1106-preview',
+            response_format={"type": "json_object"},
+            temperature=0,
+            messages=[
+                {"role": "system", "content": "You are an expert relevance ranker. Given a list of documents and a query, your job is to determine how relevant each document is for answering the query. Your output is JSON: a list of documents, each with the two fields content and relevance_score. relevance_score runs from 0.0 to 100.0; higher relevance means a higher score."},
+                {"role": "user", "content": f"Query: {query} Docs: {docs}"}
+            ]
+        )
+        scores = json.loads(response.choices[0].message.content)["documents"]
+        logger.debug(f"scores: {scores}")
+        sorted_data = sorted(scores, key=lambda x: x['relevance_score'], reverse=True)
+        top_5_contents = [doc['content'] for doc in sorted_data[:5]]
+        return top_5_contents
+
     def rag(self, query, retrieved_documents):
         # rag
         information = "\n\n".join(retrieved_documents)
-        messages = f"You are a helpful expert repo research assistant. Your users are asking questions about information contained in a repository. You will be shown the user's question, and the relevant information from the repository. Answer the user's question using only the information given.\nQuestion: {query}. \nInformation: {information}"
+        messages = f"You are a helpful expert repo research assistant. Your users are asking questions about information contained in the repo. You will be shown the user's question and the relevant information from the repo. Answer the user's question using only this information.\nQuestion: {query}. \nInformation: {information}"
         response = self.llm.complete(messages)
         content = response
         return content
@@ -49,7 +67,7 @@ def list_to_markdown(self, list_items):
 
         # Add a numbered list item for each item in the list
         for index, item in enumerate(list_items, start=1):
-            markdown_content += f"[{index}] {item}\n"
+            markdown_content += f"{index}. {item}\n"
         return markdown_content
 
     def rag_ar(self, query, related_code, embedding_recall, project_name):
@@ -83,7 +101,7 @@ def respond(self, message, instruction):
         # return answer
         prompt = self.textanslys.format_chat_prompt(message, instruction)
         questions = self.textanslys.keyword(prompt)
-        logger.debug(f"Questions: {questions}")
+        # logger.debug(f"Questions: {questions}")
         promptq = self.generate_queries(prompt, 3)
         all_results = []
         all_ids = []
@@ -93,7 +111,8 @@
             all_ids.extend(query_result['ids'][0])
         logger.debug(f"all_ids: {all_ids},{all_results}")
-        unique_ids = [id for id in all_ids if all_ids.count(id) == 1]
+        unique_ids = list(dict.fromkeys(all_ids))
+        # unique_ids = [id for id in all_ids if all_ids.count(id) == 1]
         logger.debug(f"uniqueid: {unique_ids}")
         unique_documents = []
         unique_code = []
@@ -102,30 +121,49 @@
             if id in unique_ids:
                 unique_documents.append(doc)
                 unique_code.append(code.get("code_content"))
-        unique_code = self.textanslys.list_to_markdown(unique_code)
-        retrieved_documents = unique_documents
+
+        retrieved_documents = self.rerank(message, unique_documents)
         # logger.debug(f"retrieveddocuments: {retrieved_documents}")
         response = self.rag(prompt, retrieved_documents)
         chunkrecall = self.list_to_markdown(retrieved_documents)
         bot_message = str(response)
         keyword = str(self.textanslys.nerquery(bot_message))
         keywords = str(self.textanslys.nerquery(str(prompt) + str(questions)))
-        codez = self.textanslys.queryblock(keyword)
-        codey = self.textanslys.queryblock(keywords)
+        codez, mdz = self.textanslys.queryblock(keyword)
+        codey, mdy = self.textanslys.queryblock(keywords)
         if not isinstance(codez, list):
-            codex = [codez]
+            codez = [codez]
+        if not isinstance(mdz, list):
+            mdz = [mdz]
         # Make sure codey is a list; convert it if it is not
         if not isinstance(codey, list):
             codey = [codey]
+        if not isinstance(mdy, list):
+            mdy = [mdy]
+
         codex = codez + codey
-        codex = self.textanslys.list_to_markdown(codex)
-        bot_message = self.rag_ar(prompt, unique_code, retrieved_documents, "test")
-        bot_message = str(bot_message) + '\n' + str(self.textanslys.tree(bot_message))
-        return message, bot_message, chunkrecall, questions, unique_code, codex
+        md = mdz + mdy
+        unique_mdx = list(set([item for sublist in md for item in sublist]))
+        uni_codex = []
+        uni_md = []
+        uni_codex = list(dict.fromkeys(codex))
+        uni_md = list(dict.fromkeys(unique_mdx))
+        codex = self.textanslys.list_to_markdown(uni_codex)
+        retrieved_documents = retrieved_documents + uni_md
+        retrieved_documents = list(dict.fromkeys(retrieved_documents))
+        retrieved_documents = self.rerank(message, retrieved_documents[:6])
+        uni_code = uni_codex + unique_code
+        uni_code = list(dict.fromkeys(uni_code))
+        uni_code = self.rerank(message, uni_code[:6])
+        unique_code = self.textanslys.list_to_markdown(unique_code)
+        bot_message = self.rag_ar(prompt, uni_code, retrieved_documents, "test")
+        bot_message = str(bot_message)
+        return message, bot_message, chunkrecall, questions, unique_code, codex
+
 
 if __name__ == "__main__":
     api_key = ""
     api_base = ""
     db_path = ""
     log_file = ""
-    assistant = RepoAssistant(api_key, api_base, db_path, log_file)
\ No newline at end of file
+    assistant = RepoAssistant(api_key, api_base, db_path, log_file)
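The dropout fix is easiest to see in isolation: the old comprehension kept only ids that occurred exactly once, so any chunk recalled by two of the expanded queries was silently discarded, while `dict.fromkeys` dedups and preserves first-seen order instead:

```python
all_ids = ["a", "b", "a", "c"]  # "a" was recalled by two of the expanded queries

# Old logic: ids seen more than once are removed entirely, losing "a".
old = [i for i in all_ids if all_ids.count(i) == 1]  # ['b', 'c']

# New logic: order-preserving deduplication keeps one copy of everything.
new = list(dict.fromkeys(all_ids))                   # ['a', 'b', 'c']
print(old, new)
```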
diff --git a/repo_agent/chat_with_repo/vectordb.py b/repo_agent/chat_with_repo/vectordb.py
index f6558b7..f7b2e22 100644
--- a/repo_agent/chat_with_repo/vectordb.py
+++ b/repo_agent/chat_with_repo/vectordb.py
@@ -2,31 +2,33 @@
 from chromadb.utils import embedding_functions
 from repo_agent.log import logger
 
-logger.add("./log.txt", level="DEBUG", format="{time} - {name} - {level} - {message}")
+
 class ChromaManager:
     def __init__(self, api_key, api_base):
         self.api_key = api_key
         self.api_base = api_base
         self.chroma_collection = None
-        self.is_new_collection = False
+        self.is_new_collection = False
         self.init_chroma_collection()
 
     def init_chroma_collection(self):
-
-        chroma_client = chromadb.PersistentClient(path="./chroma_db")
+        chroma_client = chromadb.PersistentClient(path=".chroma_db")
 
         # Get the list of all collections
         existing_collections = chroma_client.list_collections()
-        # logger.debug(f"Questions: {existing_collections}")
+        logger.debug(f"Questions: {existing_collections}")
 
         # Check whether the "test" collection exists
         if "test" in existing_collections:
             # Load the collection if it exists
-            self.chroma_collection = chroma_client.get_collection("test", embedding_function=embedding_functions.OpenAIEmbeddingFunction(
-                api_key=self.api_key,
-                api_base=self.api_base,
-                model_name="text-embedding-3-large"
-            ))
+            self.chroma_collection = chroma_client.get_collection(
+                "test",
+                embedding_function=embedding_functions.OpenAIEmbeddingFunction(
+                    api_key=self.api_key,
+                    api_base=self.api_base,
+                    model_name="text-embedding-ada-002",
+                ),
+            )
             self.is_new_collection = False
         else:
             # Create the collection if it does not exist
             try:
                 self.chroma_collection = chroma_client.create_collection(
                     "test",
                     embedding_function=embedding_functions.OpenAIEmbeddingFunction(
                         api_key=self.api_key,
                         api_base=self.api_base,
-                        model_name="text-embedding-3-large"
-                    )
+                        model_name="text-embedding-ada-002",
+                    ),
                 )
                 self.is_new_collection = True
             except chromadb.db.base.UniqueConstraintError:
                 # The collection already exists if creation raises an error
-                self.chroma_collection = chroma_client.get_collection("test", embedding_function=embedding_functions.OpenAIEmbeddingFunction(
-                    api_key=self.api_key,
-                    api_base=self.api_base,
-                    model_name="text-embedding-3-large"
-                ))
+                self.chroma_collection = chroma_client.get_collection(
+                    "test",
+                    embedding_function=embedding_functions.OpenAIEmbeddingFunction(
+                        api_key=self.api_key,
+                        api_base=self.api_base,
+                        model_name="text-embedding-ada-002",
+                    ),
+                )
                 self.is_new_collection = False
 
-    def create_vector_store(self, md_contents, meta_data):
+    def create_vector_store(self, md_contents, meta_data):
         # Process Markdown content and store it in Chroma
         if self.is_new_collection:  # Only do this for a new collection
-            # logger.debug(f"judge: {self.is_new_collection}")
-            ids = [str(i) for i in range(len(md_contents))]
-            self.chroma_collection.add(ids=ids, documents=md_contents, metadatas=meta_data)
+            # Make ids match the shorter of md_contents and meta_data
+            min_length = min(len(md_contents), len(meta_data))
+            ids = [str(i) for i in range(min_length)]
+            # Use only the corresponding slices of md_contents and meta_data
+            self.chroma_collection.add(ids=ids, documents=md_contents[:min_length], metadatas=meta_data[:min_length])
         else:
             logger.debug(f"judge: {self.is_new_collection}")
 
+
 if __name__ == "__main__":
-    test = ChromaManager(api_key = "", api_base = "")
-
\ No newline at end of file
+    test = ChromaManager(api_key="", api_base="")

From b1921b708ea2218b3c59a18fd37e2dcc62f3b932 Mon Sep 17 00:00:00 2001
From: innovation64
Date: Sun, 18 Feb 2024 15:35:54 +0800
Subject: [PATCH 3/3] fix: fix the RAG dropout problem, reduce redundancy, and
 add a rerank step

---
 repo_agent/chat_with_repo/gradio_interface.py | 222 ++++++++++++------
 repo_agent/chat_with_repo/vectordb.py        |  32 ++-
 2 files changed, 159 insertions(+), 95 deletions(-)

diff --git a/repo_agent/chat_with_repo/gradio_interface.py b/repo_agent/chat_with_repo/gradio_interface.py
index 7a43ec6..33ee630 100644
--- a/repo_agent/chat_with_repo/gradio_interface.py
+++ b/repo_agent/chat_with_repo/gradio_interface.py
@@ -6,102 +6,170 @@
 class GradioInterface:
     def __init__(self, respond_function):
         self.respond = respond_function
+        self.cssa = """
+
+ + """ + self.cssb = """ +
+
+
+
+        """
         self.setup_gradio_interface()
 
     def wrapper_respond(self, msg_input, system_input):
         # Call the original respond function
-        msg, output1, output2, output3, code = self.respond(msg_input, system_input)
+        msg, output1, output2, output3, code, codex = self.respond(msg_input, system_input)
         output1 = markdown.markdown(str(output1))
         output2 = markdown.markdown(str(output2))
         code = markdown.markdown(str(code))
         output1 = (
-            
" - + str(output1) - + "
" - ) + self.cssa + +""" +
Response
+
+
+ """ + + str(output1) + +""" +
+
+
+ """ + ) output2 = ( - "
" - + str(output2) - + "
" - ) - code = ( - "
" - + str(code) - + "
" - ) - - return msg, output1, output2, output3, code - - def setup_gradio_interface(self): - with gr.Blocks() as demo: - gr.Markdown( + self.cssa + +""" +
Embedding Recall
+
+
+ """ + + str(output2) + + self.cssb + ) + code= ( + self.cssa + +""" +
Code
+
+
""" - # RepoAgent: Chat with doc - """ + + str(code) + +self.cssb ) - with gr.Row(): - with gr.Column(scale=2): - msg = gr.Textbox(label="Question Input") - btn = gr.Button("Submit") - with gr.Accordion(label="Advanced options", open=False): - system = gr.Textbox( - label="System message", - lines=2, - value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.", - ) - gr.Markdown("## Response") - output1 = gr.HTML( - """ -
- """ - ) + + return msg, output1, output2, output3, code, codex + def clean(self): + msg ="" + output1 =gr.HTML(self.cssa + +""" +
Response
+
+
+ + """+self.cssb) + output2 =gr.HTML(self.cssa + +""" +
Embedding Recall
+
+
+ + """+self.cssb) + output3 ="" + code =gr.HTML(self.cssa + +""" +
Code
+
+
+ + """+self.cssb) + codex = "" + return msg, output1, output2, output3, code, codex - with gr.Column(scale=1): - # output2 = gr.Textbox(label = "Embedding recall") - gr.Markdown("## Embedding Recall") - output2 = gr.HTML( - """ -
- """ - ) - with gr.Column(scale=1): - output3 = gr.Textbox(label="key words") - gr.Markdown("## Code") - code = gr.HTML( - """ -
- """ - ) + def setup_gradio_interface(self): + with gr.Blocks() as demo: + gr.Markdown(""" + # RepoAgent: Chat with doc + """) + with gr.Tab("main chat"): - btn.click( - self.wrapper_respond, - inputs=[msg, system], - outputs=[msg, output1, output2, output3, code], - ) - msg.submit( - self.wrapper_respond, - inputs=[msg, system], - outputs=[msg, output1, output2, output3, code], - ) # Press enter to submit + with gr.Row(): + with gr.Column(): + msg = gr.Textbox(label = "Question Input",lines = 4) + system = gr.Textbox(label = "(Optional)insturction editing", lines = 4) + btn = gr.Button("Submit") + btnc = gr.ClearButton() + btnr = gr.Button("record") + + output1 = gr.HTML(self.cssa + +""" +
Response
+
+
+ + """+self.cssb) + with gr.Row(): + with gr.Column(): + # output2 = gr.Textbox(label = "Embedding recall") + output2 = gr.HTML(self.cssa + +""" +
Embedding Recall
+
+
+ + """+self.cssb) + code = gr.HTML(self.cssa + +""" +
Code
+
+
+
+
+                    """+self.cssb)
+                with gr.Row():
+                    with gr.Column():
+                        output3 = gr.Textbox(label="key words", lines=2)
+                        output4 = gr.Textbox(label="key words code", lines=14)
+
+                btn.click(self.wrapper_respond, inputs=[msg, system], outputs=[msg, output1, output2, output3, code, output4])
+                btnc.click(self.clean, outputs=[msg, output1, output2, output3, code, output4])
+                msg.submit(self.wrapper_respond, inputs=[msg, system], outputs=[msg, output1, output2, output3, code, output4])  # Press enter to submit
 
         gr.close_all()
-        logger.success(f"Starting Gradio Server.", enqueue=True)
-        demo.queue().launch()
-
+        demo.queue().launch(share=True, height=800)
 
 # Usage
 if __name__ == "__main__":
-
     def respond_function(msg, system):
         # Implement your response logic here
-        return (
-            msg,
-            "RAG_output",
-            "Embedding_recall_output",
-            "Key_words_output",
-            "Code_output",
-            "QA_output",
-        )
+        RAG = """
+
+
+        """
+        return msg, RAG, "Embedding_recall_output", "Key_words_output", "Code_output"
 
-    gradio_interface = GradioInterface(respond_function)
+    gradio_interface = GradioInterface(respond_function)
\ No newline at end of file
diff --git a/repo_agent/chat_with_repo/vectordb.py b/repo_agent/chat_with_repo/vectordb.py
index f7b2e22..a942750 100644
--- a/repo_agent/chat_with_repo/vectordb.py
+++ b/repo_agent/chat_with_repo/vectordb.py
@@ -8,11 +8,12 @@ def __init__(self, api_key, api_base):
         self.api_key = api_key
         self.api_base = api_base
         self.chroma_collection = None
-        self.is_new_collection = False
+        self.is_new_collection = False
         self.init_chroma_collection()
 
     def init_chroma_collection(self):
-        chroma_client = chromadb.PersistentClient(path=".chroma_db")
+
+        chroma_client = chromadb.PersistentClient(path="./chroma_db")
 
         # Get the list of all collections
         existing_collections = chroma_client.list_collections()
 
@@ -21,14 +22,11 @@ def init_chroma_collection(self):
         # Check whether the "test" collection exists
         if "test" in existing_collections:
             # Load the collection if it exists
-            self.chroma_collection = chroma_client.get_collection(
-                "test",
-                embedding_function=embedding_functions.OpenAIEmbeddingFunction(
-                    api_key=self.api_key,
-                    api_base=self.api_base,
-                    model_name="text-embedding-ada-002",
-                ),
-            )
+            self.chroma_collection = chroma_client.get_collection("test", embedding_function=embedding_functions.OpenAIEmbeddingFunction(
+                api_key=self.api_key,
+                api_base=self.api_base,
+                model_name="text-embedding-3-small"
+            ))
             self.is_new_collection = False
         else:
             # Create the collection if it does not exist
@@ -38,20 +36,17 @@ def init_chroma_collection(self):
                 embedding_function=embedding_functions.OpenAIEmbeddingFunction(
                     api_key=self.api_key,
                     api_base=self.api_base,
-                    model_name="text-embedding-ada-002",
-                ),
+                    model_name="text-embedding-3-small"
+                )
                 )
                 self.is_new_collection = True
             except chromadb.db.base.UniqueConstraintError:
                 # The collection already exists if creation raises an error
-                self.chroma_collection = chroma_client.get_collection(
-                    "test",
-                    embedding_function=embedding_functions.OpenAIEmbeddingFunction(
+                self.chroma_collection = chroma_client.get_collection("test", embedding_function=embedding_functions.OpenAIEmbeddingFunction(
                     api_key=self.api_key,
                     api_base=self.api_base,
-                    model_name="text-embedding-ada-002",
-                ),
-            )
+                    model_name="text-embedding-3-small"
+                ))
                 self.is_new_collection = False
 
     def create_vector_store(self, md_contents, meta_data):
@@ -66,5 +61,6 @@ def create_vector_store(self, md_contents, meta_data):
             logger.debug(f"judge: {self.is_new_collection}")
 
 
+
 if __name__ == "__main__":
     test = ChromaManager(api_key="", api_base="")
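One detail worth noting survives all three patches: `create_vector_store` generates ids only up to the shorter of its two input lists, so Chroma never receives mismatched `ids`/`documents`/`metadatas`. The same alignment can be expressed with `zip`; a dependency-free sketch of the guard:

```python
md_contents = ["doc one", "doc two", "doc three"]
meta_data = [{"name": "a"}, {"name": "b"}]  # one entry short

# zip truncates to the shortest input, exactly like the min_length guard.
rows = list(zip(md_contents, meta_data))
ids = [str(i) for i in range(len(rows))]
documents = [doc for doc, _ in rows]
metadatas = [meta for _, meta in rows]
print(ids, documents, metadatas)  # two aligned records, nothing mismatched
```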