Codium-ai · mrT23 · Jan 29, 2024 · Jan 29, 2024 · Jan 29, 2024 · Jan 29, 2024
diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py
@@ -181,7 +181,7 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
  ...
  """
 
- patch_with_lines_str = f"\n\n## {file.filename}\n"
+ patch_with_lines_str = f"\n\n## file: '{file.filename.strip()}'\n"
  patch_lines = patch.splitlines()
  RE_HUNK_HEADER = re.compile(
  r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
@@ -202,11 +202,11 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
  if new_content_lines:
  if prev_header_line:
  patch_with_lines_str += f'\n{prev_header_line}\n'
- patch_with_lines_str += '__new hunk__\n'
+ patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__new hunk__\n'
  for i, line_new in enumerate(new_content_lines):
  patch_with_lines_str += f"{start2 + i} {line_new}\n"
  if old_content_lines:
- patch_with_lines_str += '__old hunk__\n'
+ patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__old hunk__\n'
  for line_old in old_content_lines:
  patch_with_lines_str += f"{line_old}\n"
  new_content_lines = []
@@ -236,11 +236,11 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
  if match and new_content_lines:
  if new_content_lines:
  patch_with_lines_str += f'\n{header_line}\n'
- patch_with_lines_str += '\n__new hunk__\n'
+ patch_with_lines_str = patch_with_lines_str.rstrip()+ '\n__new hunk__\n'
  for i, line_new in enumerate(new_content_lines):
  patch_with_lines_str += f"{start2 + i} {line_new}\n"
  if old_content_lines:
- patch_with_lines_str += '\n__old hunk__\n'
+ patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
  for line_old in old_content_lines:
  patch_with_lines_str += f"{line_old}\n"
 

diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py
@@ -209,9 +209,9 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo
 
  if patch:
  if not convert_hunks_to_line_numbers:
- patch_final = f"## {file.filename}\n\n{patch}\n"
+ patch_final = f"\n\n## file: '{file.filename.strip()}'\n\n{patch.strip()}\n"
  else:
- patch_final = patch
+ patch_final = "\n\n" + patch.strip()
  patches.append(patch_final)
  total_tokens += token_handler.count_tokens(patch_final)
  if get_settings().config.verbosity_level >= 2:
@@ -375,6 +375,13 @@ def get_pr_multi_diffs(git_provider: GitProvider,
  for lang in pr_languages:
  sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True))
 
+
+ # First, try a single run using the standard diff string, with patch extension and no deletions
+ patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff(
+ pr_languages, token_handler, add_line_numbers_to_hunks=True)
+ if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < get_max_tokens(model):
+ return ["\n".join(patches_extended)]
+
  patches = []
  final_diff_list = []
  total_tokens = token_handler.prompt_tokens

diff --git a/pr_agent/settings/pr_add_docs.toml b/pr_agent/settings/pr_add_docs.toml
@@ -5,7 +5,7 @@ Your task is to generate {{ docs_for_language }} for code components in the PR D
 
 Example for the PR Diff format:
 ======
-## src/file1.py
+## file: 'src/file1.py'
 
 @@ -12,3 +12,4 @@ def func1():
 __new hunk__
@@ -18,15 +18,14 @@ __old hunk__
 -code line that was removed in the PR
  code line2 that remained unchanged in the PR
 
-
 @@ ... @@ def func2():
 __new hunk__
 ...
 __old hunk__
 ...
 
 
-## src/file2.py
+## file: 'src/file2.py'
 ...
 ======
 

diff --git a/pr_agent/settings/pr_code_suggestions_prompts.toml b/pr_agent/settings/pr_code_suggestions_prompts.toml
@@ -4,7 +4,7 @@ Your task is to provide meaningful and actionable code suggestions, to improve t
 
 Example for the PR Diff format:
 ======
-## src/file1.py
+## file: 'src/file1.py'
 
 @@ ... @@ def func1():
 __new hunk__
@@ -16,15 +16,14 @@ __old hunk__
 -old code line2 that was removed in the PR
  code line3 that remained unchanged in the PR
 
-
 @@ ... @@ def func2():
 __new hunk__
 ...
 __old hunk__
 ...
 
 
-## src/file2.py
+## file: 'src/file2.py'
 ...
 ======
 
@@ -51,6 +50,7 @@ The output must be a YAML object equivalent to type $PRCodeSuggestions, accordin
 =====
 class CodeSuggestion(BaseModel):
  relevant_file: str = Field(description="the relevant file full path")
+ language: str = Field(description="the code language of the relevant file")
  suggestion_content: str = Field(description="an actionable suggestion for meaningfully improving the new code introduced in the PR")
 {%- if summarize_mode %}
  existing_code: str = Field(description="a short code snippet from a '__new hunk__' section to illustrate the relevant existing code. Don't show the line numbers.")
@@ -74,6 +74,8 @@ Example output:
 code_suggestions:
 - relevant_file: |-
  src/file1.py
+ language: |-
+ python
  suggestion_content: |-
  Add a docstring to func1()
 {%- if summarize_mode %}
@@ -105,11 +107,6 @@ user="""PR Info:
 
 Title: '{{title}}'
 
-{%- if language %}
-
-Main PR language: '{{ language }}'
-{%- endif %}
-
 
 The PR Diff:
 ======

diff --git a/pr_agent/settings/pr_description_prompts.toml b/pr_agent/settings/pr_description_prompts.toml
@@ -39,6 +39,7 @@ class PRType(str, Enum):
 
 Class FileDescription(BaseModel):
  filename: str = Field(description="the relevant file full path")
+ language: str = Field(description="the relevant file language")
  changes_summary: str = Field(description="concise summary of the changes in the relevant file, in bullet points (1-4 bullet points).")
  changes_title: str = Field(description="an informative title for the changes in the files, describing its main theme (5-10 words).")
  label: str = Field(description="a single semantic label that represents a type of code changes that occurred in the File. Possible values (partial list): 'bug fix', 'tests', 'enhancement', 'documentation', 'error handling', 'configuration changes', 'dependencies', 'formatting', 'miscellaneous', ...")
@@ -67,6 +68,8 @@ type:
 pr_files:
 - filename: |
  ...
+ language: |
+ ...
  changes_summary: |
  ...
  changes_title: |
@@ -104,10 +107,7 @@ Previous description:
 {%- endif %}
 
 Branch: '{{branch}}'
-{%- if language %}
 
-Main PR language: '{{ language }}'
-{%- endif %}
 {%- if commit_messages_str %}
 
 Commit messages:

diff --git a/pr_agent/settings/pr_reviewer_prompts.toml b/pr_agent/settings/pr_reviewer_prompts.toml
@@ -5,7 +5,7 @@ The review should focus on new code added in the PR diff (lines starting with '+
 
 Example PR Diff:
 ======
-## src/file1.py
+## file: 'src/file1.py'
 
 @@ -12,5 +12,5 @@ def func1():
 code line 1 that remained unchanged in the PR
@@ -14,12 +14,11 @@ code line 2 that remained unchanged in the PR
 +code line added in the PR
 code line 3 that remained unchanged in the PR
 
-
 @@ ... @@ def func2():
 ...
 
 
-## src/file2.py
+## file: 'src/file2.py'
 ...
 ======
 
@@ -115,6 +114,9 @@ PR Feedback:
  relevant file:
  type: string
  description: the relevant file full path
+ language:
+ type: string
+ description: the language of the relevant file
  suggestion:
  type: string
  description: |-
@@ -166,6 +168,8 @@ PR Feedback:
  Code feedback:
  - relevant file: |-
  directory/xxx.py
+ language: |-
+ python
  suggestion: |-
  xxx [important]
  relevant line: |-
@@ -195,10 +199,6 @@ Description:
 ======
 {%- endif %}
 
-{%- if language %}
-
-Main PR language: '{{ language }}'
-{%- endif %}
 {%- if commit_messages_str %}
 
 Commit messages:

diff --git a/pr_agent/tools/pr_code_suggestions.py b/pr_agent/tools/pr_code_suggestions.py
@@ -226,7 +226,7 @@ async def _prepare_prediction_extended(self, model: str) -> dict:
  for i, patches_diff in enumerate(patches_diff_list):
  get_logger().info(f"Processing chunk {i + 1} of {len(patches_diff_list)}")
  self.patches_diff = patches_diff
- prediction = await self._get_prediction(model)
+ prediction = await self._get_prediction(model)  # TODO: parallelize
  prediction_list.append(prediction)
  self.prediction_list = prediction_list
 
@@ -253,10 +253,15 @@ async def rank_suggestions(self, data: List) -> List:
  """
 
  suggestion_list = []
+ if not data:
+ return suggestion_list
  for suggestion in data:
  suggestion_list.append(suggestion)
  data_sorted = [[]] * len(suggestion_list)
 
+ if len(suggestion_list ) == 1:
+ return suggestion_list
+
  try:
  suggestion_str = ""
  for i, suggestion in enumerate(suggestion_list):