From dea3bc6195f32e53c39ceffdb56ef5b5a2285652 Mon Sep 17 00:00:00 2001 From: mannaandpoem <1580466765@qq.com> Date: Sat, 7 Sep 2024 18:08:48 +0800 Subject: [PATCH 1/2] Remove logic to skip the first 54 bug instances in swe_bench_data --- agentless/fl/localize.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/agentless/fl/localize.py b/agentless/fl/localize.py index af7081b..148343c 100644 --- a/agentless/fl/localize.py +++ b/agentless/fl/localize.py @@ -105,12 +105,8 @@ def localize(args): if args.start_file: start_file_locs = load_jsonl(args.start_file) - count = 0 - for bug in swe_bench_data: - if count <= 54: - count += 1 - continue + for bug in swe_bench_data: if args.target_id is not None: if args.target_id != bug["instance_id"]: continue @@ -262,7 +258,6 @@ def localize(args): ) + "\n" ) - count += 1 def merge(args): From f05106a152279eb4660b2a7d8ffaa04718c6fd4c Mon Sep 17 00:00:00 2001 From: mannaandpoem <1580466765@qq.com> Date: Sat, 7 Sep 2024 18:15:56 +0800 Subject: [PATCH 2/2] Refactor: Move retrieve_graph and construct_code_graph_context functions from localize.py and repair.py to get_repo_structure.py for code reuse --- agentless/fl/localize.py | 77 +----------------- .../get_repo_structure/get_repo_structure.py | 80 +++++++++++++++++++ agentless/repair/repair.py | 72 +---------------- 3 files changed, 82 insertions(+), 147 deletions(-) diff --git a/agentless/fl/localize.py b/agentless/fl/localize.py index 148343c..17d4f6d 100644 --- a/agentless/fl/localize.py +++ b/agentless/fl/localize.py @@ -17,87 +17,12 @@ from agentless.util.utils import load_json, load_jsonl from agentless.get_repo_structure.get_repo_structure import ( clone_repo, - get_project_structure_from_scratch, + get_project_structure_from_scratch, construct_code_graph_context, ) # PROJECT_FILE_LOC = os.environ.get("PROJECT_FILE_LOC", None) PROJECT_FILE_LOC = "./repo_structures" -def retrieve_graph(code_graph, graph_tags, search_term, structure, max_tags=100): - one_hop_tags = [] - tags = [] - for tag in graph_tags: - if tag['name'] == search_term and tag['kind'] == 'ref': - tags.append(tag) - if len(tags) >= max_tags: - break - # for tag in tags: - for i, tag in enumerate(tags): - # if i % 3 == 0: - print(f"Retrieving graph for {i}/{len(tags)}") - # find corresponding calling function/class - path = tag['rel_fname'].split('/') - s = deepcopy(structure) # stuck here - for p in path: - s = s[p] - for txt in s['functions']: - if tag['line'] >= txt['start_line'] and tag['line'] <= txt['end_line']: - one_hop_tags.append((txt, tag['rel_fname'])) - for txt in s['classes']: - for func in txt['methods']: - if tag['line'] >= func['start_line'] and tag['line'] <= func['end_line']: - func['text'].insert(0, txt['text'][0]) - one_hop_tags.append((func, tag['rel_fname'])) - return one_hop_tags - -def construct_code_graph_context(found_related_locs, code_graph, graph_tags, structure): - graph_context = "" - - graph_item_format = """ -### Dependencies for {func} -{dependencies} -""" - tag_format = """ -location: {fname} lines {start_line} - {end_line} -name: {name} -contents: -{contents} - -""" - # retrieve the code graph for dependent functions and classes - for item in found_related_locs: - code_graph_context = "" - item = item[0].splitlines() - for loc in tqdm(item): - if loc.startswith("class: ") and "." not in loc: - loc = loc[len("class: ") :].strip() - tags = retrieve_graph(code_graph, graph_tags, loc, structure) - for t, fname in tags: - code_graph_context += tag_format.format( - **t, - fname=fname, - contents="\n".join(t['text']) - ) - elif loc.startswith("function: ") and "." not in loc: - loc = loc[len("function: ") :].strip() - tags = retrieve_graph(code_graph, graph_tags, loc, structure) - for t, fname in tags: - code_graph_context += tag_format.format( - **t, - fname=fname, - contents="\n".join(t['text']) - ) - elif "." in loc: - loc = loc.split(".")[-1].strip() - tags = retrieve_graph(code_graph, graph_tags, loc, structure) - for t, fname in tags: - code_graph_context += tag_format.format( - **t, - fname=fname, - contents="\n".join(t['text']) - ) - graph_context += graph_item_format.format(func=loc, dependencies=code_graph_context) - return graph_context def localize(args): diff --git a/agentless/get_repo_structure/get_repo_structure.py b/agentless/get_repo_structure/get_repo_structure.py index 065d85e..c226879 100644 --- a/agentless/get_repo_structure/get_repo_structure.py +++ b/agentless/get_repo_structure/get_repo_structure.py @@ -4,6 +4,8 @@ import os import subprocess import uuid + +from copy import deepcopy from datasets import load_dataset import pandas as pd from tqdm import tqdm @@ -235,6 +237,84 @@ def create_structure(directory_path): return structure + +def retrieve_graph(code_graph, graph_tags, search_term, structure, max_tags=100): + one_hop_tags = [] + tags = [] + for tag in graph_tags: + if tag['name'] == search_term and tag['kind'] == 'ref': + tags.append(tag) + if len(tags) >= max_tags: + break + # for tag in tags: + for i, tag in enumerate(tags): + # if i % 3 == 0: + print(f"Retrieving graph for {i}/{len(tags)}") + # find corresponding calling function/class + path = tag['rel_fname'].split('/') + s = deepcopy(structure) # stuck here + for p in path: + s = s[p] + for txt in s['functions']: + if tag['line'] >= txt['start_line'] and tag['line'] <= txt['end_line']: + one_hop_tags.append((txt, tag['rel_fname'])) + for txt in s['classes']: + for func in txt['methods']: + if tag['line'] >= func['start_line'] and tag['line'] <= func['end_line']: + func['text'].insert(0, txt['text'][0]) + one_hop_tags.append((func, tag['rel_fname'])) + return one_hop_tags + +def construct_code_graph_context(found_related_locs, code_graph, graph_tags, structure): + graph_context = "" + + graph_item_format = """ +### Dependencies for {func} +{dependencies} +""" + tag_format = """ +location: {fname} lines {start_line} - {end_line} +name: {name} +contents: +{contents} + +""" + # retrieve the code graph for dependent functions and classes + for item in found_related_locs: + code_graph_context = "" + item = item[0].splitlines() + for loc in tqdm(item): + if loc.startswith("class: ") and "." not in loc: + loc = loc[len("class: ") :].strip() + tags = retrieve_graph(code_graph, graph_tags, loc, structure) + for t, fname in tags: + code_graph_context += tag_format.format( + **t, + fname=fname, + contents="\n".join(t['text']) + ) + elif loc.startswith("function: ") and "." not in loc: + loc = loc[len("function: ") :].strip() + tags = retrieve_graph(code_graph, graph_tags, loc, structure) + for t, fname in tags: + code_graph_context += tag_format.format( + **t, + fname=fname, + contents="\n".join(t['text']) + ) + elif "." in loc: + loc = loc.split(".")[-1].strip() + tags = retrieve_graph(code_graph, graph_tags, loc, structure) + for t, fname in tags: + code_graph_context += tag_format.format( + **t, + fname=fname, + contents="\n".join(t['text']) + ) + graph_context += graph_item_format.format(func=loc, dependencies=code_graph_context) + return graph_context + + if __name__ == '__main__': structure = create_structure('playground/astropy') diff --git a/agentless/repair/repair.py b/agentless/repair/repair.py index 20f7fd9..267a60c 100644 --- a/agentless/repair/repair.py +++ b/agentless/repair/repair.py @@ -10,6 +10,7 @@ from datasets import load_dataset from tqdm import tqdm +from agentless.get_repo_structure.get_repo_structure import construct_code_graph_context from agentless.util.api_requests import ( create_chatgpt_config, num_tokens_from_messages, @@ -192,77 +193,6 @@ Wrap the *SEARCH/REPLACE* edit in blocks ```python...```. """ -def retrieve_graph(code_graph, graph_tags, search_term, structure): - one_hop_tags = [] - tags = [] - for tag in graph_tags: - if tag['name'] == search_term and tag['kind'] == 'ref': - tags.append(tag) - for tag in tags: - # find corresponding calling function/class - path = tag['rel_fname'].split('/') - s = deepcopy(structure) - for p in path: - s = s[p] - for txt in s['functions']: - if tag['line'] >= txt['start_line'] and tag['line'] <= txt['end_line']: - one_hop_tags.append((txt, tag['rel_fname'])) - for txt in s['classes']: - for func in txt['methods']: - if tag['line'] >= func['start_line'] and tag['line'] <= func['end_line']: - func['text'].insert(0, txt['text'][0]) - one_hop_tags.append((func, tag['rel_fname'])) - return one_hop_tags - -def construct_code_graph_context(found_related_locs, code_graph, graph_tags, structure): - graph_context = "" - - graph_item_format = """ -### Dependencies for {func} -{dependencies} -""" - tag_format = """ -location: {fname} lines {start_line} - {end_line} -name: {name} -contents: -{contents} - -""" - # retrieve the code graph for dependent functions and classes - for item in found_related_locs: - code_graph_context = "" - item = item[0].splitlines() - for loc in item: - if loc.startswith("class: ") and "." not in loc: - loc = loc[len("class: ") :].strip() - tags = retrieve_graph(code_graph, graph_tags, loc, structure) - for t, fname in tags: - code_graph_context += tag_format.format( - **t, - fname=fname, - contents="\n".join(t['text']) - ) - elif loc.startswith("function: ") and "." not in loc: - loc = loc[len("function: ") :].strip() - tags = retrieve_graph(code_graph, graph_tags, loc, structure) - for t, fname in tags: - code_graph_context += tag_format.format( - **t, - fname=fname, - contents="\n".join(t['text']) - ) - elif "." in loc: - loc = loc.split(".")[-1].strip() - tags = retrieve_graph(code_graph, graph_tags, loc, structure) - for t, fname in tags: - code_graph_context += tag_format.format( - **t, - fname=fname, - contents="\n".join(t['text']) - ) - graph_context += graph_item_format.format(func=loc, dependencies=code_graph_context) - return graph_context - def _post_process_multifile_repair(