From fb9335f4247514ec15c3ee1a4ce3a8200d4bf28b Mon Sep 17 00:00:00 2001 From: mrT23 Date: Mon, 21 Aug 2023 09:07:21 +0300 Subject: [PATCH 1/5] extended improve --- pr_agent/agent/pr_agent.py | 2 + pr_agent/algo/ai_handler.py | 2 +- pr_agent/algo/git_patch_processing.py | 2 +- pr_agent/algo/pr_processing.py | 100 +++++++- pr_agent/config_loader.py | 1 + pr_agent/settings/configuration.toml | 8 + .../settings/pr_code_suggestions_prompts.toml | 67 +++--- pr_agent/settings/pr_reviewer_prompts.toml | 4 +- .../pr_sort_code_suggestions_prompts.toml | 46 ++++ .../tools/pr_extended_code_suggestions.py | 215 ++++++++++++++++++ 10 files changed, 406 insertions(+), 41 deletions(-) create mode 100644 pr_agent/settings/pr_sort_code_suggestions_prompts.toml create mode 100644 pr_agent/tools/pr_extended_code_suggestions.py diff --git a/pr_agent/agent/pr_agent.py b/pr_agent/agent/pr_agent.py index 70121f3ca..f722695c0 100644 --- a/pr_agent/agent/pr_agent.py +++ b/pr_agent/agent/pr_agent.py @@ -11,6 +11,7 @@ from pr_agent.tools.pr_information_from_user import PRInformationFromUser from pr_agent.tools.pr_questions import PRQuestions from pr_agent.tools.pr_reviewer import PRReviewer +from pr_agent.tools.pr_extended_code_suggestions import PRExtendedCodeSuggestions from pr_agent.tools.pr_update_changelog import PRUpdateChangelog from pr_agent.tools.pr_config import PRConfig @@ -25,6 +26,7 @@ "describe_pr": PRDescription, "improve": PRCodeSuggestions, "improve_code": PRCodeSuggestions, + "extended_improve": PRExtendedCodeSuggestions, "ask": PRQuestions, "ask_question": PRQuestions, "update_changelog": PRUpdateChangelog, diff --git a/pr_agent/algo/ai_handler.py b/pr_agent/algo/ai_handler.py index fb5f64fe9..0bc879d1e 100644 --- a/pr_agent/algo/ai_handler.py +++ b/pr_agent/algo/ai_handler.py @@ -55,7 +55,7 @@ def deployment_id(self): @retry(exceptions=(APIError, Timeout, TryAgain, AttributeError, RateLimitError), tries=OPENAI_RETRIES, delay=2, backoff=2, jitter=(1, 3)) - async def chat_completion(self, model: str, temperature: float, system: str, user: str): + async def chat_completion(self, model: str, system: str, user: str, temperature: float = 0.2): """ Performs a chat completion using the OpenAI ChatCompletion API. Retries in case of API errors or timeouts. diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py index 1aec00066..230df41e4 100644 --- a/pr_agent/algo/git_patch_processing.py +++ b/pr_agent/algo/git_patch_processing.py @@ -176,7 +176,7 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: ... """ - patch_with_lines_str = f"## {file.filename}\n" + patch_with_lines_str = f"\n\n## {file.filename}\n" import re patch_lines = patch.splitlines() RE_HUNK_HEADER = re.compile( diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py index adab95063..1003f4566 100644 --- a/pr_agent/algo/pr_processing.py +++ b/pr_agent/algo/pr_processing.py @@ -57,7 +57,7 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files) # generate a standard diff string, with patch extension - patches_extended, total_tokens = pr_generate_extended_diff(pr_languages, token_handler, + patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff(pr_languages, token_handler, add_line_numbers_to_hunks) # if we are under the limit, return the full diff @@ -78,9 +78,9 @@ def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: s return final_diff -def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler, - add_line_numbers_to_hunks: bool) -> \ - Tuple[list, int]: +def pr_generate_extended_diff(pr_languages: list, + token_handler: TokenHandler, + add_line_numbers_to_hunks: bool) -> Tuple[list, int, list]: """ Generate a standard diff string with patch extension, while counting the number of tokens used and applying diff minimization techniques if needed. @@ -90,13 +90,10 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler, files. - token_handler: An object of the TokenHandler class used for handling tokens in the context of the pull request. - add_line_numbers_to_hunks: A boolean indicating whether to add line numbers to the hunks in the diff. - - Returns: - - patches_extended: A list of extended patches for each file in the pull request. - - total_tokens: The total number of tokens used in the extended patches. """ total_tokens = token_handler.prompt_tokens # initial tokens patches_extended = [] + patches_extended_tokens = [] for lang in pr_languages: for file in lang['files']: original_file_content_str = file.base_file @@ -108,15 +105,16 @@ def pr_generate_extended_diff(pr_languages: list, token_handler: TokenHandler, extended_patch = extend_patch(original_file_content_str, patch, num_lines=PATCH_EXTRA_LINES) full_extended_patch = f"## {file.filename}\n\n{extended_patch}\n" - if add_line_numbers_to_hunks: + if add_line_numbers_to_hunks and PATCH_EXTRA_LINES > 0: full_extended_patch = convert_to_hunks_with_lines_numbers(extended_patch, file) patch_tokens = token_handler.count_tokens(full_extended_patch) file.tokens = patch_tokens total_tokens += patch_tokens + patches_extended_tokens.append(patch_tokens) patches_extended.append(full_extended_patch) - return patches_extended, total_tokens + return patches_extended, total_tokens, patches_extended_tokens def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, model: str, @@ -337,4 +335,84 @@ def clip_tokens(text: str, max_tokens: int) -> str: return clipped_text except Exception as e: logging.warning(f"Failed to clip tokens: {e}") - return text \ No newline at end of file + return text + + +def get_pr_multi_diffs(git_provider: GitProvider, + token_handler: TokenHandler, + model: str, + max_calls: int = 5) -> List[str]: + """ + Retrieves the diff files from a Git provider, sorts them by main language, and generates patches for each file. + The patches are split into multiple groups based on the maximum number of tokens allowed for the given model. + + Args: + git_provider (GitProvider): An object that provides access to Git provider APIs. + token_handler (TokenHandler): An object that handles tokens in the context of a pull request. + model (str): The name of the model. + max_calls (int, optional): The maximum number of calls to retrieve diff files. Defaults to 5. + + Returns: + List[str]: A list of final diff strings, split into multiple groups based on the maximum number of tokens allowed for the given model. + + Raises: + RateLimitExceededException: If the rate limit for the Git provider API is exceeded. + """ + try: + diff_files = git_provider.get_diff_files() + except RateLimitExceededException as e: + logging.error(f"Rate limit exceeded for git provider API. original message {e}") + raise + + # Sort files by main language + pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files) + + # Sort files within each language group by tokens in descending order + sorted_files = [] + for lang in pr_languages: + sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True)) + + patches = [] + final_diff_list = [] + total_tokens = token_handler.prompt_tokens + call_number = 1 + for file in sorted_files: + if call_number > max_calls: + if get_settings().config.verbosity_level >= 2: + logging.info(f"Reached max calls ({max_calls})") + break + + original_file_content_str = file.base_file + new_file_content_str = file.head_file + patch = file.patch + if not patch: + continue + + # Remove delete-only hunks + patch = handle_patch_deletions(patch, original_file_content_str, new_file_content_str, file.filename) + if patch is None: + continue + + patch = convert_to_hunks_with_lines_numbers(patch, file) + new_patch_tokens = token_handler.count_tokens(patch) + if patch and (total_tokens + new_patch_tokens > MAX_TOKENS[model] - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD): + final_diff = "\n".join(patches) + final_diff_list.append(final_diff) + patches = [] + total_tokens = token_handler.prompt_tokens + call_number += 1 + if get_settings().config.verbosity_level >= 2: + logging.info(f"Call number: {call_number}") + + if patch: + patches.append(patch) + total_tokens += new_patch_tokens + if get_settings().config.verbosity_level >= 2: + logging.info(f"Tokens: {total_tokens}, last filename: {file.filename}") + + # Add the last chunk + if patches: + final_diff = "\n".join(patches) + final_diff_list.append(final_diff) + + return final_diff_list diff --git a/pr_agent/config_loader.py b/pr_agent/config_loader.py index 3075e8dcd..47edfd975 100644 --- a/pr_agent/config_loader.py +++ b/pr_agent/config_loader.py @@ -19,6 +19,7 @@ "settings/pr_questions_prompts.toml", "settings/pr_description_prompts.toml", "settings/pr_code_suggestions_prompts.toml", + "settings/pr_sort_code_suggestions_prompts.toml", "settings/pr_information_from_user_prompts.toml", "settings/pr_update_changelog_prompts.toml", "settings_prod/.secrets.toml" diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index ce920efd3..00c9b4535 100644 --- a/pr_agent/settings/configuration.toml +++ b/pr_agent/settings/configuration.toml @@ -32,6 +32,14 @@ extra_instructions = "" num_code_suggestions=4 extra_instructions = "" +[pr_extendeted_code_suggestions] # /extended_improve # +num_code_suggestions_per_chunk=8 +extra_instructions = "" +max_number_of_calls = 5 +rank_suggestions = true +final_clip_factor = 0.5 + + [pr_update_changelog] # /update_changelog # push_changelog_changes=false extra_instructions = "" diff --git a/pr_agent/settings/pr_code_suggestions_prompts.toml b/pr_agent/settings/pr_code_suggestions_prompts.toml index 76a3cb6be..be90c840f 100644 --- a/pr_agent/settings/pr_code_suggestions_prompts.toml +++ b/pr_agent/settings/pr_code_suggestions_prompts.toml @@ -1,19 +1,47 @@ [pr_code_suggestions_prompt] -system="""You are a language model called CodiumAI-PR-Code-Reviewer. -Your task is to provide meaningfull non-trivial code suggestions to improve the new code in a PR (the '+' lines). -- Try to give important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningfull code improvements, like performance, vulnerability, modularity, and best practices. -- Suggestions should refer only to the 'new hunk' code, and focus on improving the new added code lines, with '+'. +system="""You are a language model called PR-Code-Reviewer. +Your task is to provide meaningful actionable code suggestions, to improve the new code presented in a PR. + +Example PR Diff input: +' +## src/file1.py + +--new hunk-- +12 code line that already existed in the file... +13 code line that already existed in the file.... +14 +new code line added in the PR +15 code line that already existed in the file... +16 code line that already existed in the file... + +--old hunk-- + code line that already existed in the file... +-code line that was removed in the PR + code line that already existed in the file... + + +--new hunk-- +... +--old hunk-- +... + + +## src/file2.py +... +' + +Specific instructions: +- Focus on important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningful code improvements, like performance, vulnerability, modularity, and best practices. +- Suggestions should refer only to code from the '--new hunk--' sections, and focus on new lines of code (lines starting with '+'). - Provide the exact line number range (inclusive) for each issue. -- Assume there is additional code in the relevant file that is not included in the diff. +- Assume there is additional relevant code, that is not included in the diff. - Provide up to {{ num_code_suggestions }} code suggestions. -- Make sure not to provide suggestions repeating modifications already implemented in the new PR code (the '+' lines). -- Don't output line numbers in the 'improved code' snippets. +- Avoid making suggestions that have already been implemented in the PR code. For example, if you propose adding a docstring, type hint, or anything else, make sure it isn't already in the '--new hunk--' code. {%- if extra_instructions %} Extra instructions from the user: {{ extra_instructions }} -{% endif %} +{%- endif %} You must use the following JSON schema to format your answer: ```json @@ -30,39 +58,26 @@ You must use the following JSON schema to format your answer: }, "suggestion content": { "type": "string", - "description": "a concrete suggestion for meaningfully improving the new PR code." + "description": "a concrete suggestion for meaningfully improving the new PR code (lines from the '--new hunk--' sections, starting with '+')." }, "existing code": { "type": "string", - "description": "a code snippet showing authentic relevant code lines from a 'new hunk' section. It must be continuous, correctly formatted and indented, and without line numbers." + "description": "a code snippet showing the relevant code lines from a '--new hunk--' section. It must be continuous, correctly formatted and indented, and without line numbers." }, "relevant lines": { "type": "string", - "description": "the relevant lines in the 'new hunk' sections, in the format of 'start_line-end_line'. For example: '10-15'. They should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above." + "description": "the relevant lines from a '--new hunk--' section, in the format of 'start_line-end_line'. For example: '10-15'. They should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above." }, "improved code": { "type": "string", - "description": "a new code snippet that can be used to replace the relevant lines in 'new hunk' code. Replacement suggestions should be complete, correctly formatted and indented, and without line numbers." + "description": "a new code snippet that can be used to replace the relevant lines in '--new hunk--' code. Replacement suggestions should be complete, correctly formatted and indented, and without line numbers." } } } } ``` -Example input: -' -## src/file1.py ----new_hunk--- -``` -[new hunk code, annotated with line numbers] -``` ----old_hunk--- -``` -[old hunk code] -``` -... -' - +Don't output line numbers in the 'improved code' snippets. Don't repeat the prompt in the answer, and avoid outputting the 'type' and 'description' fields. """ diff --git a/pr_agent/settings/pr_reviewer_prompts.toml b/pr_agent/settings/pr_reviewer_prompts.toml index cdf7f731e..b65d62e44 100644 --- a/pr_agent/settings/pr_reviewer_prompts.toml +++ b/pr_agent/settings/pr_reviewer_prompts.toml @@ -1,9 +1,9 @@ [pr_review_prompt] system="""You are CodiumAI-PR-Reviewer, a language model designed to review git pull requests. -Your task is to provide constructive and concise feedback for the PR, and also provide meaningfull code suggestions to improve the new PR code (the '+' lines). +Your task is to provide constructive and concise feedback for the PR, and also provide meaningful code suggestions to improve the new PR code (the '+' lines). {%- if num_code_suggestions > 0 %} - Provide up to {{ num_code_suggestions }} code suggestions. -- Try to focus on the most important suggestions, like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningfull code improvements, like performance, vulnerability, modularity, and best practices. +- Try to focus on the most important suggestions, like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningful code improvements, like performance, vulnerability, modularity, and best practices. - Suggestions should focus on improving the new added code lines. - Make sure not to provide suggestions repeating modifications already implemented in the new PR code (the '+' lines). {%- endif %} diff --git a/pr_agent/settings/pr_sort_code_suggestions_prompts.toml b/pr_agent/settings/pr_sort_code_suggestions_prompts.toml new file mode 100644 index 000000000..16b6e8618 --- /dev/null +++ b/pr_agent/settings/pr_sort_code_suggestions_prompts.toml @@ -0,0 +1,46 @@ +[pr_sort_code_suggestions_prompt] +system=""" +""" + +user="""You are given a list of code suggestions to improve a PR: + +{{ suggestion_str|trim }} + + +Your task is to sort the code suggestions by their order of importance, and return a list with sorting order. +The sorting order is a list of pairs, where each pair contains the index of the suggestion in the original list. +Rank the suggestions based on their importance to improving the PR, with critical issues first and minor issues last. + +You must use the following YAML schema to format your answer: +```yaml +Sort Order: + type: array + maxItems: {{ suggestion_list|length }} + uniqueItems: true + items: + suggestion number: + type: integer + minimum: 1 + maximum: {{ suggestion_list|length }} + importance order: + type: integer + minimum: 1 + maximum: {{ suggestion_list|length }} +``` + +Example output: +```yaml +Sort Order: + - suggestion number: 1 + importance order: 2 + - suggestion number: 2 + importance order: 3 + - suggestion number: 3 + importance order: 1 +``` + +Make sure to output a valid YAML. Use multi-line block scalar ('|') if needed. +Don't repeat the prompt in the answer, and avoid outputting the 'type' and 'description' fields. +Response (should be a valid YAML, and nothing else): +```yaml +""" diff --git a/pr_agent/tools/pr_extended_code_suggestions.py b/pr_agent/tools/pr_extended_code_suggestions.py new file mode 100644 index 000000000..17f7b570f --- /dev/null +++ b/pr_agent/tools/pr_extended_code_suggestions.py @@ -0,0 +1,215 @@ +from typing import List +import copy +import json +import logging +import textwrap +from typing import Dict, Any + +import yaml +from jinja2 import Environment, StrictUndefined + +from pr_agent.algo.ai_handler import AiHandler +from pr_agent.algo.pr_processing import get_pr_multi_diffs, retry_with_fallback_models +from pr_agent.algo.token_handler import TokenHandler +from pr_agent.algo.utils import try_fix_json +from pr_agent.config_loader import get_settings +from pr_agent.git_providers import get_git_provider +from pr_agent.git_providers.git_provider import get_main_pr_language + + +class PRExtendedCodeSuggestions: + def __init__(self, pr_url: str, cli_mode=False, args: list = None): + + self.git_provider = get_git_provider()(pr_url) + self.main_language = get_main_pr_language( + self.git_provider.get_languages(), self.git_provider.get_files() + ) + + self.ai_handler = AiHandler() + self.patches_diff = None + self.prediction = None + self.cli_mode = cli_mode + self.vars = { + "title": self.git_provider.pr.title, + "branch": self.git_provider.get_pr_branch(), + "description": self.git_provider.get_pr_description(), + "language": self.main_language, + "diff": "", # empty diff for initial calculation + "num_code_suggestions": get_settings().pr_extendeted_code_suggestions.num_code_suggestions_per_chunk, + "extra_instructions": get_settings().pr_extendeted_code_suggestions.extra_instructions, + "commit_messages_str": self.git_provider.get_commit_messages(), + } + self.token_handler = TokenHandler(self.git_provider.pr, + self.vars, + get_settings().pr_code_suggestions_prompt.system, + get_settings().pr_code_suggestions_prompt.user) + + async def run(self): + logging.info('Generating code suggestions for PR...') + if get_settings().config.publish_output: + self.git_provider.publish_comment("Preparing review...", is_temporary=True) + data = await retry_with_fallback_models(self._prepare_prediction) + + if get_settings().pr_extendeted_code_suggestions.rank_suggestions: + logging.info('Ranking Suggestions...') + data['Code suggestions'] = await self.rank_suggestions(data['Code suggestions']) + + logging.info('Preparing PR review...') + if get_settings().config.publish_output: + logging.info('Pushing PR review...') + self.git_provider.remove_initial_comment() + logging.info('Pushing inline code comments...') + self.push_inline_code_suggestions(data) + + async def _prepare_prediction(self, model: str) -> dict: + logging.info('Getting PR diff...') + patches_diff_list = get_pr_multi_diffs(self.git_provider, self.token_handler, model, + max_calls=get_settings().pr_extendeted_code_suggestions.max_number_of_calls) + + logging.info('Getting multi AI predictions...') + prediction_list = [] + for i, patches_diff in enumerate(patches_diff_list): + logging.info(f"Processing chunk {i + 1} of {len(patches_diff_list)}") + self.patches_diff = patches_diff + prediction = await self._get_prediction(model) + prediction_list.append(prediction) + self.prediction_list = prediction_list + + data = {} + for prediction in prediction_list: + self.prediction = prediction + data_per_chunk = self._prepare_pr_code_suggestions() + if "Code suggestions" in data: + data["Code suggestions"].extend(data_per_chunk["Code suggestions"]) + else: + data.update(data_per_chunk) + self.data = data + return data + + async def _get_prediction(self, model: str): + variables = copy.deepcopy(self.vars) + variables["diff"] = self.patches_diff # update diff + environment = Environment(undefined=StrictUndefined) + system_prompt = environment.from_string(get_settings().pr_code_suggestions_prompt.system).render(variables) + user_prompt = environment.from_string(get_settings().pr_code_suggestions_prompt.user).render(variables) + if get_settings().config.verbosity_level >= 2: + logging.info(f"\nSystem prompt:\n{system_prompt}") + logging.info(f"\nUser prompt:\n{user_prompt}") + response, finish_reason = await self.ai_handler.chat_completion(model=model, temperature=0.2, + system=system_prompt, user=user_prompt) + + return response + + def _prepare_pr_code_suggestions(self) -> str: + review = self.prediction.strip() + try: + data = json.loads(review) + except json.decoder.JSONDecodeError: + if get_settings().config.verbosity_level >= 1: + logging.info(f"Could not parse json response: {review}") + data = try_fix_json(review, code_suggestions=True) + return data + + def push_inline_code_suggestions(self, data): + code_suggestions = [] + + if not data['Code suggestions']: + return self.git_provider.publish_comment('No suggestions found to improve this PR.') + + for d in data['Code suggestions']: + try: + if get_settings().config.verbosity_level >= 1: + logging.info(f"suggestion: {d}") + relevant_file = d['relevant file'].strip() + relevant_lines_str = d['relevant lines'].strip() + if ',' in relevant_lines_str: # handling 'relevant lines': '181, 190' or '178-184, 188-194' + relevant_lines_str = relevant_lines_str.split(',')[0] + relevant_lines_start = int(relevant_lines_str.split('-')[0]) # absolute position + relevant_lines_end = int(relevant_lines_str.split('-')[-1]) + content = d['suggestion content'] + new_code_snippet = d['improved code'] + + if new_code_snippet: + new_code_snippet = self.dedent_code(relevant_file, relevant_lines_start, new_code_snippet) + + body = f"**Suggestion:** {content}\n```suggestion\n" + new_code_snippet + "\n```" + code_suggestions.append({'body': body, 'relevant_file': relevant_file, + 'relevant_lines_start': relevant_lines_start, + 'relevant_lines_end': relevant_lines_end}) + except Exception: + if get_settings().config.verbosity_level >= 1: + logging.info(f"Could not parse suggestion: {d}") + + self.git_provider.publish_code_suggestions(code_suggestions) + + def dedent_code(self, relevant_file, relevant_lines_start, new_code_snippet): + try: # dedent code snippet + self.diff_files = self.git_provider.diff_files if self.git_provider.diff_files \ + else self.git_provider.get_diff_files() + original_initial_line = None + for file in self.diff_files: + if file.filename.strip() == relevant_file: + original_initial_line = file.head_file.splitlines()[relevant_lines_start - 1] + break + if original_initial_line: + suggested_initial_line = new_code_snippet.splitlines()[0] + original_initial_spaces = len(original_initial_line) - len(original_initial_line.lstrip()) + suggested_initial_spaces = len(suggested_initial_line) - len(suggested_initial_line.lstrip()) + delta_spaces = original_initial_spaces - suggested_initial_spaces + if delta_spaces > 0: + new_code_snippet = textwrap.indent(new_code_snippet, delta_spaces * " ").rstrip('\n') + except Exception as e: + if get_settings().config.verbosity_level >= 1: + logging.info(f"Could not dedent code snippet for file {relevant_file}, error: {e}") + + return new_code_snippet + + async def rank_suggestions(self, data: List) -> List: + """ + Call a model to rank (sort) code suggestions based on their importance order. + + Args: + data (List): A list of code suggestions to be ranked. + + Returns: + List: The ranked list of code suggestions. + """ + + suggestion_list = [] + # remove invalid suggestions + for i, suggestion in enumerate(data): + if suggestion['existing code'] != suggestion['improved code']: + suggestion_list.append(suggestion) + + data_sorted = [[]] * len(suggestion_list) + + try: + suggestion_str = "" + for i, suggestion in enumerate(suggestion_list): + suggestion_str += f"suggestion {i + 1}: " + str(suggestion) + '\n\n' + + variables = {'suggestion_list': suggestion_list, 'suggestion_str': suggestion_str} + model = get_settings().config.model + environment = Environment(undefined=StrictUndefined) + system_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.system).render(variables) + user_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.user).render(variables) + if get_settings().config.verbosity_level >= 2: + logging.info(f"\nSystem prompt:\n{system_prompt}") + logging.info(f"\nUser prompt:\n{user_prompt}") + response, finish_reason = await self.ai_handler.chat_completion(model=model, system=system_prompt, user=user_prompt) + + sort_order = yaml.safe_load(response) + for s in sort_order['Sort Order']: + suggestion_number = s['suggestion number'] + importance_order = s['importance order'] + data_sorted[importance_order - 1] = suggestion_list[suggestion_number - 1] + + if get_settings().pr_extendeted_code_suggestions.final_clip_factor != 1: + new_len = int(0.5 + len(data_sorted) * get_settings().pr_extendeted_code_suggestions.final_clip_factor) + data_sorted = data_sorted[:new_len] + except Exception as e: + if get_settings().config.verbosity_level >= 1: + logging.info(f"Could not sort suggestions, error: {e}") + data_sorted = suggestion_list + + return data_sorted From b85679e5e44e212a1e3ea3a171492f85477cdb9a Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 22 Aug 2023 09:42:59 +0300 Subject: [PATCH 2/5] improve --extend --- pr_agent/agent/pr_agent.py | 2 - pr_agent/algo/utils.py | 3 +- pr_agent/cli.py | 20 +- pr_agent/servers/help.py | 2 +- pr_agent/settings/configuration.toml | 10 +- pr_agent/tools/pr_code_suggestions.py | 107 ++++++++- .../tools/pr_extended_code_suggestions.py | 215 ------------------ 7 files changed, 122 insertions(+), 237 deletions(-) delete mode 100644 pr_agent/tools/pr_extended_code_suggestions.py diff --git a/pr_agent/agent/pr_agent.py b/pr_agent/agent/pr_agent.py index f722695c0..70121f3ca 100644 --- a/pr_agent/agent/pr_agent.py +++ b/pr_agent/agent/pr_agent.py @@ -11,7 +11,6 @@ from pr_agent.tools.pr_information_from_user import PRInformationFromUser from pr_agent.tools.pr_questions import PRQuestions from pr_agent.tools.pr_reviewer import PRReviewer -from pr_agent.tools.pr_extended_code_suggestions import PRExtendedCodeSuggestions from pr_agent.tools.pr_update_changelog import PRUpdateChangelog from pr_agent.tools.pr_config import PRConfig @@ -26,7 +25,6 @@ "describe_pr": PRDescription, "improve": PRCodeSuggestions, "improve_code": PRCodeSuggestions, - "extended_improve": PRExtendedCodeSuggestions, "ask": PRQuestions, "ask_question": PRQuestions, "update_changelog": PRUpdateChangelog, diff --git a/pr_agent/algo/utils.py b/pr_agent/algo/utils.py index 87e206cf0..2139203f6 100644 --- a/pr_agent/algo/utils.py +++ b/pr_agent/algo/utils.py @@ -247,7 +247,8 @@ def update_settings_from_args(args: List[str]) -> List[str]: arg = arg.strip('-').strip() vals = arg.split('=', 1) if len(vals) != 2: - logging.error(f'Invalid argument format: {arg}') + if len(vals) > 2: # --extended is a valid argument + logging.error(f'Invalid argument format: {arg}') other_args.append(arg) continue key, value = _fix_key_value(*vals) diff --git a/pr_agent/cli.py b/pr_agent/cli.py index 0f8710419..01c1a7ec5 100644 --- a/pr_agent/cli.py +++ b/pr_agent/cli.py @@ -19,13 +19,21 @@ def run(inargs=None): - cli.py --pr_url=... reflect Supported commands: -review / review_pr - Add a review that includes a summary of the PR and specific suggestions for improvement. -ask / ask_question [question] - Ask a question about the PR. -describe / describe_pr - Modify the PR title and description based on the PR's contents. -improve / improve_code - Suggest improvements to the code in the PR as pull request comments ready to commit. -reflect - Ask the PR author questions about the PR. -update_changelog - Update the changelog based on the PR's contents. +-review / review_pr - Add a review that includes a summary of the PR and specific suggestions for improvement. +-ask / ask_question [question] - Ask a question about the PR. + +-describe / describe_pr - Modify the PR title and description based on the PR's contents. + +-improve / improve_code - Suggest improvements to the code in the PR as pull request comments ready to commit. +Extended mode ('improve --extended') employs several calls, and provides a more thorough feedback + +-reflect - Ask the PR author questions about the PR. + +-update_changelog - Update the changelog based on the PR's contents. + + +Configuration: To edit any configuration parameter from 'configuration.toml', just add -config_path=. For example: 'python cli.py --pr_url=... review --pr_reviewer.extra_instructions="focus on the file: ..."' """) diff --git a/pr_agent/servers/help.py b/pr_agent/servers/help.py index 1ee7fc4d5..0f3f3caaf 100644 --- a/pr_agent/servers/help.py +++ b/pr_agent/servers/help.py @@ -1,7 +1,7 @@ commands_text = "> **/review [-i]**: Request a review of your Pull Request. For an incremental review, which only " \ "considers changes since the last review, include the '-i' option.\n" \ "> **/describe**: Modify the PR title and description based on the contents of the PR.\n" \ - "> **/improve**: Suggest improvements to the code in the PR. \n" \ + "> **/improve [--extended]**: Suggest improvements to the code in the PR. Extended mode employs several calls, and provides a more thorough feedback. \n" \ "> **/ask \\**: Pose a question about the PR.\n" \ "> **/update_changelog**: Update the changelog based on the PR's contents.\n\n" \ ">To edit any configuration parameter from **configuration.toml**, add --config_path=new_value\n" \ diff --git a/pr_agent/settings/configuration.toml b/pr_agent/settings/configuration.toml index 00c9b4535..b1d19f979 100644 --- a/pr_agent/settings/configuration.toml +++ b/pr_agent/settings/configuration.toml @@ -31,14 +31,12 @@ extra_instructions = "" [pr_code_suggestions] # /improve # num_code_suggestions=4 extra_instructions = "" - -[pr_extendeted_code_suggestions] # /extended_improve # +rank_suggestions = false +# params for '/improve --extended' mode num_code_suggestions_per_chunk=8 -extra_instructions = "" +rank_extended_suggestions = true max_number_of_calls = 5 -rank_suggestions = true -final_clip_factor = 0.5 - +final_clip_factor = 0.9 [pr_update_changelog] # /update_changelog # push_changelog_changes=false diff --git a/pr_agent/tools/pr_code_suggestions.py b/pr_agent/tools/pr_code_suggestions.py index ebb88b211..4dc2f4008 100644 --- a/pr_agent/tools/pr_code_suggestions.py +++ b/pr_agent/tools/pr_code_suggestions.py @@ -2,11 +2,13 @@ import json import logging import textwrap +from typing import List +import yaml from jinja2 import Environment, StrictUndefined from pr_agent.algo.ai_handler import AiHandler -from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models +from pr_agent.algo.pr_processing import get_pr_diff, retry_with_fallback_models, get_pr_multi_diffs from pr_agent.algo.token_handler import TokenHandler from pr_agent.algo.utils import try_fix_json from pr_agent.config_loader import get_settings @@ -22,6 +24,13 @@ def __init__(self, pr_url: str, cli_mode=False, args: list = None): self.git_provider.get_languages(), self.git_provider.get_files() ) + # extended mode + self.is_extended = any(["extended" in arg for arg in args]) + if self.is_extended: + num_code_suggestions = get_settings().pr_code_suggestions.num_code_suggestions_per_chunk + else: + num_code_suggestions = get_settings().pr_code_suggestions.num_code_suggestions + self.ai_handler = AiHandler() self.patches_diff = None self.prediction = None @@ -32,7 +41,7 @@ def __init__(self, pr_url: str, cli_mode=False, args: list = None): "description": self.git_provider.get_pr_description(), "language": self.main_language, "diff": "", # empty diff for initial calculation - "num_code_suggestions": get_settings().pr_code_suggestions.num_code_suggestions, + "num_code_suggestions": num_code_suggestions, "extra_instructions": get_settings().pr_code_suggestions.extra_instructions, "commit_messages_str": self.git_provider.get_commit_messages(), } @@ -42,14 +51,22 @@ def __init__(self, pr_url: str, cli_mode=False, args: list = None): get_settings().pr_code_suggestions_prompt.user) async def run(self): - # assert type(self.git_provider) != BitbucketProvider, "Bitbucket is not supported for now" - logging.info('Generating code suggestions for PR...') if get_settings().config.publish_output: self.git_provider.publish_comment("Preparing review...", is_temporary=True) - await retry_with_fallback_models(self._prepare_prediction) + logging.info('Preparing PR review...') - data = self._prepare_pr_code_suggestions() + if not self.is_extended: + await retry_with_fallback_models(self._prepare_prediction) + data = self._prepare_pr_code_suggestions() + else: + data = await retry_with_fallback_models(self._prepare_prediction_extended) + + if (not self.is_extended and get_settings().pr_code_suggestions.rank_suggestions) or \ + (self.is_extended and get_settings().pr_code_suggestions.rank_extended_suggestions): + logging.info('Ranking Suggestions...') + data['Code suggestions'] = await self.rank_suggestions(data['Code suggestions']) + if get_settings().config.publish_output: logging.info('Pushing PR review...') self.git_provider.remove_initial_comment() @@ -145,3 +162,81 @@ def dedent_code(self, relevant_file, relevant_lines_start, new_code_snippet): return new_code_snippet + async def _prepare_prediction_extended(self, model: str) -> dict: + logging.info('Getting PR diff...') + patches_diff_list = get_pr_multi_diffs(self.git_provider, self.token_handler, model, + max_calls=get_settings().pr_code_suggestions.max_number_of_calls) + + logging.info('Getting multi AI predictions...') + prediction_list = [] + for i, patches_diff in enumerate(patches_diff_list): + logging.info(f"Processing chunk {i + 1} of {len(patches_diff_list)}") + self.patches_diff = patches_diff + prediction = await self._get_prediction(model) + prediction_list.append(prediction) + self.prediction_list = prediction_list + + data = {} + for prediction in prediction_list: + self.prediction = prediction + data_per_chunk = self._prepare_pr_code_suggestions() + if "Code suggestions" in data: + data["Code suggestions"].extend(data_per_chunk["Code suggestions"]) + else: + data.update(data_per_chunk) + self.data = data + return data + + async def rank_suggestions(self, data: List) -> List: + """ + Call a model to rank (sort) code suggestions based on their importance order. + + Args: + data (List): A list of code suggestions to be ranked. + + Returns: + List: The ranked list of code suggestions. + """ + + suggestion_list = [] + # remove invalid suggestions + for i, suggestion in enumerate(data): + if suggestion['existing code'] != suggestion['improved code']: + suggestion_list.append(suggestion) + + data_sorted = [[]] * len(suggestion_list) + + try: + suggestion_str = "" + for i, suggestion in enumerate(suggestion_list): + suggestion_str += f"suggestion {i + 1}: " + str(suggestion) + '\n\n' + + variables = {'suggestion_list': suggestion_list, 'suggestion_str': suggestion_str} + model = get_settings().config.model + environment = Environment(undefined=StrictUndefined) + system_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.system).render( + variables) + user_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.user).render(variables) + if get_settings().config.verbosity_level >= 2: + logging.info(f"\nSystem prompt:\n{system_prompt}") + logging.info(f"\nUser prompt:\n{user_prompt}") + response, finish_reason = await self.ai_handler.chat_completion(model=model, system=system_prompt, + user=user_prompt) + + sort_order = yaml.safe_load(response) + for s in sort_order['Sort Order']: + suggestion_number = s['suggestion number'] + importance_order = s['importance order'] + data_sorted[importance_order - 1] = suggestion_list[suggestion_number - 1] + + if get_settings().pr_extendeted_code_suggestions.final_clip_factor != 1: + new_len = int(0.5 + len(data_sorted) * get_settings().pr_extendeted_code_suggestions.final_clip_factor) + data_sorted = data_sorted[:new_len] + except Exception as e: + if get_settings().config.verbosity_level >= 1: + logging.info(f"Could not sort suggestions, error: {e}") + data_sorted = suggestion_list + + return data_sorted + + diff --git a/pr_agent/tools/pr_extended_code_suggestions.py b/pr_agent/tools/pr_extended_code_suggestions.py deleted file mode 100644 index 17f7b570f..000000000 --- a/pr_agent/tools/pr_extended_code_suggestions.py +++ /dev/null @@ -1,215 +0,0 @@ -from typing import List -import copy -import json -import logging -import textwrap -from typing import Dict, Any - -import yaml -from jinja2 import Environment, StrictUndefined - -from pr_agent.algo.ai_handler import AiHandler -from pr_agent.algo.pr_processing import get_pr_multi_diffs, retry_with_fallback_models -from pr_agent.algo.token_handler import TokenHandler -from pr_agent.algo.utils import try_fix_json -from pr_agent.config_loader import get_settings -from pr_agent.git_providers import get_git_provider -from pr_agent.git_providers.git_provider import get_main_pr_language - - -class PRExtendedCodeSuggestions: - def __init__(self, pr_url: str, cli_mode=False, args: list = None): - - self.git_provider = get_git_provider()(pr_url) - self.main_language = get_main_pr_language( - self.git_provider.get_languages(), self.git_provider.get_files() - ) - - self.ai_handler = AiHandler() - self.patches_diff = None - self.prediction = None - self.cli_mode = cli_mode - self.vars = { - "title": self.git_provider.pr.title, - "branch": self.git_provider.get_pr_branch(), - "description": self.git_provider.get_pr_description(), - "language": self.main_language, - "diff": "", # empty diff for initial calculation - "num_code_suggestions": get_settings().pr_extendeted_code_suggestions.num_code_suggestions_per_chunk, - "extra_instructions": get_settings().pr_extendeted_code_suggestions.extra_instructions, - "commit_messages_str": self.git_provider.get_commit_messages(), - } - self.token_handler = TokenHandler(self.git_provider.pr, - self.vars, - get_settings().pr_code_suggestions_prompt.system, - get_settings().pr_code_suggestions_prompt.user) - - async def run(self): - logging.info('Generating code suggestions for PR...') - if get_settings().config.publish_output: - self.git_provider.publish_comment("Preparing review...", is_temporary=True) - data = await retry_with_fallback_models(self._prepare_prediction) - - if get_settings().pr_extendeted_code_suggestions.rank_suggestions: - logging.info('Ranking Suggestions...') - data['Code suggestions'] = await self.rank_suggestions(data['Code suggestions']) - - logging.info('Preparing PR review...') - if get_settings().config.publish_output: - logging.info('Pushing PR review...') - self.git_provider.remove_initial_comment() - logging.info('Pushing inline code comments...') - self.push_inline_code_suggestions(data) - - async def _prepare_prediction(self, model: str) -> dict: - logging.info('Getting PR diff...') - patches_diff_list = get_pr_multi_diffs(self.git_provider, self.token_handler, model, - max_calls=get_settings().pr_extendeted_code_suggestions.max_number_of_calls) - - logging.info('Getting multi AI predictions...') - prediction_list = [] - for i, patches_diff in enumerate(patches_diff_list): - logging.info(f"Processing chunk {i + 1} of {len(patches_diff_list)}") - self.patches_diff = patches_diff - prediction = await self._get_prediction(model) - prediction_list.append(prediction) - self.prediction_list = prediction_list - - data = {} - for prediction in prediction_list: - self.prediction = prediction - data_per_chunk = self._prepare_pr_code_suggestions() - if "Code suggestions" in data: - data["Code suggestions"].extend(data_per_chunk["Code suggestions"]) - else: - data.update(data_per_chunk) - self.data = data - return data - - async def _get_prediction(self, model: str): - variables = copy.deepcopy(self.vars) - variables["diff"] = self.patches_diff # update diff - environment = Environment(undefined=StrictUndefined) - system_prompt = environment.from_string(get_settings().pr_code_suggestions_prompt.system).render(variables) - user_prompt = environment.from_string(get_settings().pr_code_suggestions_prompt.user).render(variables) - if get_settings().config.verbosity_level >= 2: - logging.info(f"\nSystem prompt:\n{system_prompt}") - logging.info(f"\nUser prompt:\n{user_prompt}") - response, finish_reason = await self.ai_handler.chat_completion(model=model, temperature=0.2, - system=system_prompt, user=user_prompt) - - return response - - def _prepare_pr_code_suggestions(self) -> str: - review = self.prediction.strip() - try: - data = json.loads(review) - except json.decoder.JSONDecodeError: - if get_settings().config.verbosity_level >= 1: - logging.info(f"Could not parse json response: {review}") - data = try_fix_json(review, code_suggestions=True) - return data - - def push_inline_code_suggestions(self, data): - code_suggestions = [] - - if not data['Code suggestions']: - return self.git_provider.publish_comment('No suggestions found to improve this PR.') - - for d in data['Code suggestions']: - try: - if get_settings().config.verbosity_level >= 1: - logging.info(f"suggestion: {d}") - relevant_file = d['relevant file'].strip() - relevant_lines_str = d['relevant lines'].strip() - if ',' in relevant_lines_str: # handling 'relevant lines': '181, 190' or '178-184, 188-194' - relevant_lines_str = relevant_lines_str.split(',')[0] - relevant_lines_start = int(relevant_lines_str.split('-')[0]) # absolute position - relevant_lines_end = int(relevant_lines_str.split('-')[-1]) - content = d['suggestion content'] - new_code_snippet = d['improved code'] - - if new_code_snippet: - new_code_snippet = self.dedent_code(relevant_file, relevant_lines_start, new_code_snippet) - - body = f"**Suggestion:** {content}\n```suggestion\n" + new_code_snippet + "\n```" - code_suggestions.append({'body': body, 'relevant_file': relevant_file, - 'relevant_lines_start': relevant_lines_start, - 'relevant_lines_end': relevant_lines_end}) - except Exception: - if get_settings().config.verbosity_level >= 1: - logging.info(f"Could not parse suggestion: {d}") - - self.git_provider.publish_code_suggestions(code_suggestions) - - def dedent_code(self, relevant_file, relevant_lines_start, new_code_snippet): - try: # dedent code snippet - self.diff_files = self.git_provider.diff_files if self.git_provider.diff_files \ - else self.git_provider.get_diff_files() - original_initial_line = None - for file in self.diff_files: - if file.filename.strip() == relevant_file: - original_initial_line = file.head_file.splitlines()[relevant_lines_start - 1] - break - if original_initial_line: - suggested_initial_line = new_code_snippet.splitlines()[0] - original_initial_spaces = len(original_initial_line) - len(original_initial_line.lstrip()) - suggested_initial_spaces = len(suggested_initial_line) - len(suggested_initial_line.lstrip()) - delta_spaces = original_initial_spaces - suggested_initial_spaces - if delta_spaces > 0: - new_code_snippet = textwrap.indent(new_code_snippet, delta_spaces * " ").rstrip('\n') - except Exception as e: - if get_settings().config.verbosity_level >= 1: - logging.info(f"Could not dedent code snippet for file {relevant_file}, error: {e}") - - return new_code_snippet - - async def rank_suggestions(self, data: List) -> List: - """ - Call a model to rank (sort) code suggestions based on their importance order. - - Args: - data (List): A list of code suggestions to be ranked. - - Returns: - List: The ranked list of code suggestions. - """ - - suggestion_list = [] - # remove invalid suggestions - for i, suggestion in enumerate(data): - if suggestion['existing code'] != suggestion['improved code']: - suggestion_list.append(suggestion) - - data_sorted = [[]] * len(suggestion_list) - - try: - suggestion_str = "" - for i, suggestion in enumerate(suggestion_list): - suggestion_str += f"suggestion {i + 1}: " + str(suggestion) + '\n\n' - - variables = {'suggestion_list': suggestion_list, 'suggestion_str': suggestion_str} - model = get_settings().config.model - environment = Environment(undefined=StrictUndefined) - system_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.system).render(variables) - user_prompt = environment.from_string(get_settings().pr_sort_code_suggestions_prompt.user).render(variables) - if get_settings().config.verbosity_level >= 2: - logging.info(f"\nSystem prompt:\n{system_prompt}") - logging.info(f"\nUser prompt:\n{user_prompt}") - response, finish_reason = await self.ai_handler.chat_completion(model=model, system=system_prompt, user=user_prompt) - - sort_order = yaml.safe_load(response) - for s in sort_order['Sort Order']: - suggestion_number = s['suggestion number'] - importance_order = s['importance order'] - data_sorted[importance_order - 1] = suggestion_list[suggestion_number - 1] - - if get_settings().pr_extendeted_code_suggestions.final_clip_factor != 1: - new_len = int(0.5 + len(data_sorted) * get_settings().pr_extendeted_code_suggestions.final_clip_factor) - data_sorted = data_sorted[:new_len] - except Exception as e: - if get_settings().config.verbosity_level >= 1: - logging.info(f"Could not sort suggestions, error: {e}") - data_sorted = suggestion_list - - return data_sorted From f4f040bf8d8443224f083d4cb544c206de9767fc Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 22 Aug 2023 16:11:51 +0300 Subject: [PATCH 3/5] publish each suggestion separably --- pr_agent/algo/git_patch_processing.py | 24 ++++++++++++------- pr_agent/algo/pr_processing.py | 10 ++++---- .../settings/pr_code_suggestions_prompts.toml | 24 ++++++++++--------- pr_agent/tools/pr_code_suggestions.py | 12 ++++++---- 4 files changed, 42 insertions(+), 28 deletions(-) diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py index 230df41e4..1a2bd22bd 100644 --- a/pr_agent/algo/git_patch_processing.py +++ b/pr_agent/algo/git_patch_processing.py @@ -1,5 +1,4 @@ from __future__ import annotations - import logging import re @@ -157,7 +156,7 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: example output: ## src/file.ts ---new hunk-- +__new hunk__ 881 line1 882 line2 883 line3 @@ -166,7 +165,7 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: 889 line6 890 line7 ... ---old hunk-- +__old hunk__ line1 line2 - line3 @@ -177,7 +176,6 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: """ patch_with_lines_str = f"\n\n## {file.filename}\n" - import re patch_lines = patch.splitlines() RE_HUNK_HEADER = re.compile( r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)") @@ -185,23 +183,30 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: old_content_lines = [] match = None start1, size1, start2, size2 = -1, -1, -1, -1 + prev_header_line = [] + header_line =[] for line in patch_lines: if 'no newline at end of file' in line.lower(): continue if line.startswith('@@'): + header_line = line match = RE_HUNK_HEADER.match(line) if match and new_content_lines: # found a new hunk, split the previous lines if new_content_lines: - patch_with_lines_str += '\n--new hunk--\n' + if prev_header_line: + patch_with_lines_str += f'\n{prev_header_line}\n' + patch_with_lines_str += '__new hunk__\n' for i, line_new in enumerate(new_content_lines): patch_with_lines_str += f"{start2 + i} {line_new}\n" if old_content_lines: - patch_with_lines_str += '--old hunk--\n' + patch_with_lines_str += '__old hunk__\n' for line_old in old_content_lines: patch_with_lines_str += f"{line_old}\n" new_content_lines = [] old_content_lines = [] + if match: + prev_header_line = header_line try: start1, size1, start2, size2 = map(int, match.groups()[:4]) except: # '@@ -0,0 +1 @@' case @@ -219,12 +224,13 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str: # finishing last hunk if match and new_content_lines: if new_content_lines: - patch_with_lines_str += '\n--new hunk--\n' + patch_with_lines_str += f'\n{header_line}\n' + patch_with_lines_str += '\n__new hunk__\n' for i, line_new in enumerate(new_content_lines): patch_with_lines_str += f"{start2 + i} {line_new}\n" if old_content_lines: - patch_with_lines_str += '\n--old hunk--\n' + patch_with_lines_str += '\n__old hunk__\n' for line_old in old_content_lines: patch_with_lines_str += f"{line_old}\n" - return patch_with_lines_str.strip() + return patch_with_lines_str.rstrip() diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py index 1003f4566..29709d29c 100644 --- a/pr_agent/algo/pr_processing.py +++ b/pr_agent/algo/pr_processing.py @@ -24,7 +24,7 @@ PATCH_EXTRA_LINES = 3 def get_pr_diff(git_provider: GitProvider, token_handler: TokenHandler, model: str, - add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False) -> str: + add_line_numbers_to_hunks: bool = True, disable_extra_lines: bool = True) -> str: """ Returns a string with the diff of the pull request, applying diff minimization techniques if needed. @@ -103,9 +103,9 @@ def pr_generate_extended_diff(pr_languages: list, # extend each patch with extra lines of context extended_patch = extend_patch(original_file_content_str, patch, num_lines=PATCH_EXTRA_LINES) - full_extended_patch = f"## {file.filename}\n\n{extended_patch}\n" + full_extended_patch = f"\n\n## {file.filename}\n\n{extended_patch}\n" - if add_line_numbers_to_hunks and PATCH_EXTRA_LINES > 0: + if add_line_numbers_to_hunks: full_extended_patch = convert_to_hunks_with_lines_numbers(extended_patch, file) patch_tokens = token_handler.count_tokens(full_extended_patch) @@ -322,7 +322,9 @@ def clip_tokens(text: str, max_tokens: int) -> str: Returns: str: The clipped string. """ - # We'll estimate the number of tokens by hueristically assuming 2.5 tokens per word + if not text: + return text + try: encoder = get_token_encoder() num_input_tokens = len(encoder.encode(text)) diff --git a/pr_agent/settings/pr_code_suggestions_prompts.toml b/pr_agent/settings/pr_code_suggestions_prompts.toml index be90c840f..e32ffd728 100644 --- a/pr_agent/settings/pr_code_suggestions_prompts.toml +++ b/pr_agent/settings/pr_code_suggestions_prompts.toml @@ -6,22 +6,23 @@ Example PR Diff input: ' ## src/file1.py ---new hunk-- +@@ -12,3 +12,5 @@ def func1(): +__new hunk__ 12 code line that already existed in the file... 13 code line that already existed in the file.... 14 +new code line added in the PR 15 code line that already existed in the file... 16 code line that already existed in the file... - ---old hunk-- +__old hunk__ code line that already existed in the file... -code line that was removed in the PR code line that already existed in the file... ---new hunk-- +@@ ... @@ def func2(): +__new hunk__ ... ---old hunk-- +__old hunk__ ... @@ -31,11 +32,12 @@ Example PR Diff input: Specific instructions: - Focus on important suggestions like fixing code problems, issues and bugs. As a second priority, provide suggestions for meaningful code improvements, like performance, vulnerability, modularity, and best practices. -- Suggestions should refer only to code from the '--new hunk--' sections, and focus on new lines of code (lines starting with '+'). +- Suggestions should refer only to code from the '__new hunk__' sections, and focus on new lines of code (lines starting with '+'). - Provide the exact line number range (inclusive) for each issue. - Assume there is additional relevant code, that is not included in the diff. - Provide up to {{ num_code_suggestions }} code suggestions. -- Avoid making suggestions that have already been implemented in the PR code. For example, if you propose adding a docstring, type hint, or anything else, make sure it isn't already in the '--new hunk--' code. +- Avoid making suggestions that have already been implemented in the PR code. For example, if you propose adding a docstring, type hint, or anything else, make sure it isn't already in the '__new hunk__' code. +- Don't suggest to add docstring or type hints. {%- if extra_instructions %} @@ -58,19 +60,19 @@ You must use the following JSON schema to format your answer: }, "suggestion content": { "type": "string", - "description": "a concrete suggestion for meaningfully improving the new PR code (lines from the '--new hunk--' sections, starting with '+')." + "description": "a concrete suggestion for meaningfully improving the new PR code (lines from the '__new hunk__' sections, starting with '+')." }, "existing code": { "type": "string", - "description": "a code snippet showing the relevant code lines from a '--new hunk--' section. It must be continuous, correctly formatted and indented, and without line numbers." + "description": "a code snippet showing the relevant code lines from a '__new hunk__' section. It must be continuous, correctly formatted and indented, and without line numbers." }, "relevant lines": { "type": "string", - "description": "the relevant lines from a '--new hunk--' section, in the format of 'start_line-end_line'. For example: '10-15'. They should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above." + "description": "the relevant lines from a '__new hunk__' section, in the format of 'start_line-end_line'. For example: '10-15'. They should be derived from the hunk line numbers, and correspond to the 'existing code' snippet above." }, "improved code": { "type": "string", - "description": "a new code snippet that can be used to replace the relevant lines in '--new hunk--' code. Replacement suggestions should be complete, correctly formatted and indented, and without line numbers." + "description": "a new code snippet that can be used to replace the relevant lines in '__new hunk__' code. Replacement suggestions should be complete, correctly formatted and indented, and without line numbers." } } } diff --git a/pr_agent/tools/pr_code_suggestions.py b/pr_agent/tools/pr_code_suggestions.py index 4dc2f4008..cc787f5e4 100644 --- a/pr_agent/tools/pr_code_suggestions.py +++ b/pr_agent/tools/pr_code_suggestions.py @@ -70,7 +70,7 @@ async def run(self): if get_settings().config.publish_output: logging.info('Pushing PR review...') self.git_provider.remove_initial_comment() - logging.info('Pushing inline code comments...') + logging.info('Pushing inline code suggestions...') self.push_inline_code_suggestions(data) async def _prepare_prediction(self, model: str): @@ -138,7 +138,11 @@ def push_inline_code_suggestions(self, data): if get_settings().config.verbosity_level >= 2: logging.info(f"Could not parse suggestion: {d}") - self.git_provider.publish_code_suggestions(code_suggestions) + is_successful = self.git_provider.publish_code_suggestions(code_suggestions) + if not is_successful: + logging.info("Failed to publish code suggestions, trying to publish each suggestion separately") + for code_suggestion in code_suggestions: + self.git_provider.publish_code_suggestions([code_suggestion]) def dedent_code(self, relevant_file, relevant_lines_start, new_code_snippet): try: # dedent code snippet @@ -229,8 +233,8 @@ async def rank_suggestions(self, data: List) -> List: importance_order = s['importance order'] data_sorted[importance_order - 1] = suggestion_list[suggestion_number - 1] - if get_settings().pr_extendeted_code_suggestions.final_clip_factor != 1: - new_len = int(0.5 + len(data_sorted) * get_settings().pr_extendeted_code_suggestions.final_clip_factor) + if get_settings().pr_code_suggestions.final_clip_factor != 1: + new_len = int(0.5 + len(data_sorted) * get_settings().pr_code_suggestions.final_clip_factor) data_sorted = data_sorted[:new_len] except Exception as e: if get_settings().config.verbosity_level >= 1: From 36e5e5a17e559a340df961517f506d62fb41bd97 Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 22 Aug 2023 16:30:18 +0300 Subject: [PATCH 4/5] update --- pr_agent/settings/pr_code_suggestions_prompts.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pr_agent/settings/pr_code_suggestions_prompts.toml b/pr_agent/settings/pr_code_suggestions_prompts.toml index e32ffd728..a65b546f5 100644 --- a/pr_agent/settings/pr_code_suggestions_prompts.toml +++ b/pr_agent/settings/pr_code_suggestions_prompts.toml @@ -36,7 +36,7 @@ Specific instructions: - Provide the exact line number range (inclusive) for each issue. - Assume there is additional relevant code, that is not included in the diff. - Provide up to {{ num_code_suggestions }} code suggestions. -- Avoid making suggestions that have already been implemented in the PR code. For example, if you propose adding a docstring, type hint, or anything else, make sure it isn't already in the '__new hunk__' code. +- Avoid making suggestions that have already been implemented in the PR code. For example, if you want to add logs or change a variable to const, or anything else, make sure it isn't already in the '__new hunk__' code. - Don't suggest to add docstring or type hints. {%- if extra_instructions %} From 9157fa670e707ea6733d10821818a07132915c2c Mon Sep 17 00:00:00 2001 From: mrT23 Date: Tue, 22 Aug 2023 16:32:22 +0300 Subject: [PATCH 5/5] -> bool --- pr_agent/git_providers/bitbucket_provider.py | 2 +- pr_agent/git_providers/git_provider.py | 2 +- pr_agent/git_providers/github_provider.py | 2 +- pr_agent/git_providers/gitlab_provider.py | 2 +- pr_agent/git_providers/local_git_provider.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pr_agent/git_providers/bitbucket_provider.py b/pr_agent/git_providers/bitbucket_provider.py index 3596f4bf1..85ead79d6 100644 --- a/pr_agent/git_providers/bitbucket_provider.py +++ b/pr_agent/git_providers/bitbucket_provider.py @@ -36,7 +36,7 @@ def get_repo_settings(self): except Exception: return "" - def publish_code_suggestions(self, code_suggestions: list): + def publish_code_suggestions(self, code_suggestions: list) -> bool: """ Publishes code suggestions as comments on the PR. """ diff --git a/pr_agent/git_providers/git_provider.py b/pr_agent/git_providers/git_provider.py index 4d711a14d..c3be8bc3b 100644 --- a/pr_agent/git_providers/git_provider.py +++ b/pr_agent/git_providers/git_provider.py @@ -54,7 +54,7 @@ def publish_inline_comments(self, comments: list[dict]): pass @abstractmethod - def publish_code_suggestions(self, code_suggestions: list): + def publish_code_suggestions(self, code_suggestions: list) -> bool: pass @abstractmethod diff --git a/pr_agent/git_providers/github_provider.py b/pr_agent/git_providers/github_provider.py index be0fa645d..f400b92d1 100644 --- a/pr_agent/git_providers/github_provider.py +++ b/pr_agent/git_providers/github_provider.py @@ -166,7 +166,7 @@ def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_ def publish_inline_comments(self, comments: list[dict]): self.pr.create_review(commit=self.last_commit_id, comments=comments) - def publish_code_suggestions(self, code_suggestions: list): + def publish_code_suggestions(self, code_suggestions: list) -> bool: """ Publishes code suggestions as comments on the PR. """ diff --git a/pr_agent/git_providers/gitlab_provider.py b/pr_agent/git_providers/gitlab_provider.py index ec6f236de..f1fa01191 100644 --- a/pr_agent/git_providers/gitlab_provider.py +++ b/pr_agent/git_providers/gitlab_provider.py @@ -195,7 +195,7 @@ def get_relevant_diff(self, relevant_file: str, relevant_line_in_file: int) -> O f'No relevant diff found for {relevant_file} {relevant_line_in_file}. Falling back to last diff.') return self.last_diff # fallback to last_diff if no relevant diff is found - def publish_code_suggestions(self, code_suggestions: list): + def publish_code_suggestions(self, code_suggestions: list) -> bool: for suggestion in code_suggestions: try: body = suggestion['body'] diff --git a/pr_agent/git_providers/local_git_provider.py b/pr_agent/git_providers/local_git_provider.py index a4f21969a..cf8f38b2e 100644 --- a/pr_agent/git_providers/local_git_provider.py +++ b/pr_agent/git_providers/local_git_provider.py @@ -130,7 +130,7 @@ def publish_code_suggestion(self, body: str, relevant_file: str, relevant_lines_start: int, relevant_lines_end: int): raise NotImplementedError('Publishing code suggestions is not implemented for the local git provider') - def publish_code_suggestions(self, code_suggestions: list): + def publish_code_suggestions(self, code_suggestions: list) -> bool: raise NotImplementedError('Publishing code suggestions is not implemented for the local git provider') def publish_labels(self, labels):