From 6565556e0162cb737d740338da15179ad6d34f6c Mon Sep 17 00:00:00 2001
From: mrT23 <tal.r@codium.ai>
Date: Mon, 29 Jan 2024 20:51:24 +0200
Subject: [PATCH 1/3] feat: Add 'language' field to CodeSuggestion,
 FileDescription, and ReviewerPrompt models in settings files

---
 pr_agent/settings/pr_code_suggestions_prompts.toml | 8 +++-----
 pr_agent/settings/pr_description_prompts.toml      | 6 +++---
 pr_agent/settings/pr_reviewer_prompts.toml         | 9 +++++----
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/pr_agent/settings/pr_code_suggestions_prompts.toml b/pr_agent/settings/pr_code_suggestions_prompts.toml
index 2fb224c75..ee3110544 100644
--- a/pr_agent/settings/pr_code_suggestions_prompts.toml
+++ b/pr_agent/settings/pr_code_suggestions_prompts.toml
@@ -51,6 +51,7 @@ The output must be a YAML object equivalent to type $PRCodeSuggestions, accordin
 =====
 class CodeSuggestion(BaseModel):
     relevant_file: str = Field(description="the relevant file full path")
+    language: str = Field(description="the programming language of the relevant file")
     suggestion_content: str = Field(description="an actionable suggestion for meaningfully improving the new code introduced in the PR")
 {%- if summarize_mode %}
     existing_code: str = Field(description="a short code snippet from a '__new hunk__' section to illustrate the relevant existing code. Don't show the line numbers.")
@@ -74,6 +75,8 @@ Example output:
 code_suggestions:
 - relevant_file: |-
     src/file1.py
+  language: |-
+    python
   suggestion_content: |-
     Add a docstring to func1()
 {%- if summarize_mode %}
@@ -105,11 +108,6 @@ user="""PR Info:
 
 Title: '{{title}}'
 
-{%- if language %}
-
-Main PR language: '{{ language }}'
-{%- endif %}
-
 
 The PR Diff:
 ======
diff --git a/pr_agent/settings/pr_description_prompts.toml b/pr_agent/settings/pr_description_prompts.toml
index b9c5ce394..b36b01834 100644
--- a/pr_agent/settings/pr_description_prompts.toml
+++ b/pr_agent/settings/pr_description_prompts.toml
@@ -39,6 +39,7 @@ class PRType(str, Enum):
 
 Class FileDescription(BaseModel):
     filename: str = Field(description="the relevant file full path")
+    language: str = Field(description="the programming language of the relevant file")
     changes_summary: str = Field(description="concise summary of the changes in the relevant file, in bullet points (1-4 bullet points).")
     changes_title: str = Field(description="an informative title for the changes in the files, describing its main theme (5-10 words).")
     label: str = Field(description="a single semantic label that represents a type of code changes that occurred in the File. Possible values (partial list): 'bug fix', 'tests', 'enhancement', 'documentation', 'error handling', 'configuration changes', 'dependencies', 'formatting', 'miscellaneous', ...")
@@ -67,6 +68,8 @@ type:
 pr_files:
 - filename: |
     ...
+  language: |
+    ...
   changes_summary: |
     ...
   changes_title: |
@@ -104,10 +107,7 @@ Previous description:
 {%- endif %}
 
 Branch: '{{branch}}'
-{%- if language %}
 
-Main PR language: '{{ language }}'
-{%- endif %}
 {%- if commit_messages_str %}
 
 Commit messages:
diff --git a/pr_agent/settings/pr_reviewer_prompts.toml b/pr_agent/settings/pr_reviewer_prompts.toml
index 736fb2472..5312832c6 100644
--- a/pr_agent/settings/pr_reviewer_prompts.toml
+++ b/pr_agent/settings/pr_reviewer_prompts.toml
@@ -115,6 +115,9 @@ PR Feedback:
       relevant file:
         type: string
         description: the relevant file full path
+      language:
+        type: string
+        description: the language of the relevant file
       suggestion:
         type: string
         description: |-
@@ -166,6 +169,8 @@ PR Feedback:
   Code feedback:
     - relevant file: |-
         directory/xxx.py
+      language: |-
+        python
       suggestion: |-
         xxx [important]
       relevant line: |-
@@ -195,10 +200,6 @@ Description:
 ======
 {%- endif %}
 
-{%- if language %}
-
-Main PR language: '{{ language }}'
-{%- endif %}
 {%- if commit_messages_str %}
 
 Commit messages:

From 0d867797990324964251c754d4aae38115d0fddc Mon Sep 17 00:00:00 2001
From: mrT23 <tal.r@codium.ai>
Date: Mon, 29 Jan 2024 21:52:54 +0200
Subject: [PATCH 2/3] feat: Improve patch formatting and handle empty data in
 pr_code_suggestions.py

---
 pr_agent/algo/git_patch_processing.py              | 10 +++++-----
 pr_agent/algo/pr_processing.py                     |  4 ++--
 pr_agent/settings/pr_add_docs.toml                 |  5 ++---
 pr_agent/settings/pr_code_suggestions_prompts.toml |  5 ++---
 pr_agent/settings/pr_reviewer_prompts.toml         |  5 ++---
 pr_agent/tools/pr_code_suggestions.py              |  2 ++
 6 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/pr_agent/algo/git_patch_processing.py b/pr_agent/algo/git_patch_processing.py
index 480387fa9..10d140b06 100644
--- a/pr_agent/algo/git_patch_processing.py
+++ b/pr_agent/algo/git_patch_processing.py
@@ -181,7 +181,7 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
            ...
     """
     
-    patch_with_lines_str = f"\n\n## {file.filename}\n"
+    patch_with_lines_str = f"\n\n## file: '{file.filename.strip()}'\n"
     patch_lines = patch.splitlines()
     RE_HUNK_HEADER = re.compile(
         r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
@@ -202,11 +202,11 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
                 if new_content_lines:
                     if prev_header_line:
                         patch_with_lines_str += f'\n{prev_header_line}\n'
-                    patch_with_lines_str += '__new hunk__\n'
+                    patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__new hunk__\n'
                     for i, line_new in enumerate(new_content_lines):
                         patch_with_lines_str += f"{start2 + i} {line_new}\n"
                 if old_content_lines:
-                    patch_with_lines_str += '__old hunk__\n'
+                    patch_with_lines_str = patch_with_lines_str.rstrip()+'\n__old hunk__\n'
                     for line_old in old_content_lines:
                         patch_with_lines_str += f"{line_old}\n"
                 new_content_lines = []
@@ -236,11 +236,11 @@ def convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
     if match and new_content_lines:
         if new_content_lines:
             patch_with_lines_str += f'\n{header_line}\n'
-            patch_with_lines_str += '\n__new hunk__\n'
+            patch_with_lines_str = patch_with_lines_str.rstrip()+ '\n__new hunk__\n'
             for i, line_new in enumerate(new_content_lines):
                 patch_with_lines_str += f"{start2 + i} {line_new}\n"
         if old_content_lines:
-            patch_with_lines_str += '\n__old hunk__\n'
+            patch_with_lines_str = patch_with_lines_str.rstrip() + '\n__old hunk__\n'
             for line_old in old_content_lines:
                 patch_with_lines_str += f"{line_old}\n"
 
diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py
index ecec30151..f4ffae89e 100644
--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@@ -209,9 +209,9 @@ def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, mo
 
         if patch:
             if not convert_hunks_to_line_numbers:
-                patch_final = f"## {file.filename}\n\n{patch}\n"
+                patch_final = f"\n\n## file: '{file.filename.strip()}'\n\n{patch.strip()}\n"
             else:
-                patch_final = patch
+                patch_final = "\n\n" + patch.strip()
             patches.append(patch_final)
             total_tokens += token_handler.count_tokens(patch_final)
             if get_settings().config.verbosity_level >= 2:
diff --git a/pr_agent/settings/pr_add_docs.toml b/pr_agent/settings/pr_add_docs.toml
index 44e9f0910..cc33eee5b 100644
--- a/pr_agent/settings/pr_add_docs.toml
+++ b/pr_agent/settings/pr_add_docs.toml
@@ -5,7 +5,7 @@ Your task is to generate {{ docs_for_language }} for code components in the PR D
 
 Example for the PR Diff format:
 ======
-## src/file1.py
+## file: 'src/file1.py'
 
 @@ -12,3 +12,4 @@ def func1():
 __new hunk__
@@ -18,7 +18,6 @@ __old hunk__
 -code line that was removed in the PR
  code line2 that remained unchanged in the PR
 
-
 @@ ... @@ def func2():
 __new hunk__
 ...
@@ -26,7 +25,7 @@ __old hunk__
 ...
 
 
-## src/file2.py
+## file: 'src/file2.py'
 ...
 ======
 
diff --git a/pr_agent/settings/pr_code_suggestions_prompts.toml b/pr_agent/settings/pr_code_suggestions_prompts.toml
index ee3110544..71d923505 100644
--- a/pr_agent/settings/pr_code_suggestions_prompts.toml
+++ b/pr_agent/settings/pr_code_suggestions_prompts.toml
@@ -4,7 +4,7 @@ Your task is to provide meaningful and actionable code suggestions, to improve t
 
 Example for the PR Diff format:
 ======
-## src/file1.py
+## file: 'src/file1.py'
 
 @@ ... @@ def func1():
 __new hunk__
@@ -16,7 +16,6 @@ __old hunk__
 -old code line2 that was removed in the PR
  code line3 that remained unchanged in the PR
 
-
 @@ ... @@ def func2():
 __new hunk__
 ...
@@ -24,7 +23,7 @@ __old hunk__
 ...
 
 
-## src/file2.py
+## file: 'src/file2.py'
 ...
 ======
 
diff --git a/pr_agent/settings/pr_reviewer_prompts.toml b/pr_agent/settings/pr_reviewer_prompts.toml
index 5312832c6..427cd9743 100644
--- a/pr_agent/settings/pr_reviewer_prompts.toml
+++ b/pr_agent/settings/pr_reviewer_prompts.toml
@@ -5,7 +5,7 @@ The review should focus on new code added in the PR diff (lines starting with '+
 
 Example PR Diff:
 ======
-## src/file1.py
+## file: 'src/file1.py'
 
 @@ -12,5 +12,5 @@ def func1():
 code line 1 that remained unchanged in the PR
@@ -14,12 +14,11 @@ code line 2 that remained unchanged in the PR
 +code line added in the PR
 code line 3 that remained unchanged in the PR
 
-
 @@ ... @@ def func2():
 ...
 
 
-## src/file2.py
+## file: 'src/file2.py'
 ...
 ======
 
diff --git a/pr_agent/tools/pr_code_suggestions.py b/pr_agent/tools/pr_code_suggestions.py
index 1456f9a61..08074899e 100644
--- a/pr_agent/tools/pr_code_suggestions.py
+++ b/pr_agent/tools/pr_code_suggestions.py
@@ -253,6 +253,8 @@ async def rank_suggestions(self, data: List) -> List:
         """
 
         suggestion_list = []
+        if not data:
+            return suggestion_list
         for suggestion in data:
             suggestion_list.append(suggestion)
         data_sorted = [[]] * len(suggestion_list)

From 15c8fe94bb29963cf12e77c046834c0c547da810 Mon Sep 17 00:00:00 2001
From: mrT23 <tal.r@codium.ai>
Date: Mon, 29 Jan 2024 22:00:11 +0200
Subject: [PATCH 3/3] feat: Try a single extended-diff run first and skip
 ranking when there is only one suggestion

---
 pr_agent/algo/pr_processing.py        | 7 +++++++
 pr_agent/tools/pr_code_suggestions.py | 5 ++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/pr_agent/algo/pr_processing.py b/pr_agent/algo/pr_processing.py
index f4ffae89e..1e482dbfb 100644
--- a/pr_agent/algo/pr_processing.py
+++ b/pr_agent/algo/pr_processing.py
@@ -375,6 +375,13 @@ def get_pr_multi_diffs(git_provider: GitProvider,
     for lang in pr_languages:
         sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True))
 
+
+    # First try a single run: the standard diff string, with patch extension and no deletions.
+    patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff(
+        pr_languages, token_handler, add_line_numbers_to_hunks=True)
+    if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < get_max_tokens(model):
+        return ["\n".join(patches_extended)]
+
     patches = []
     final_diff_list = []
     total_tokens = token_handler.prompt_tokens
diff --git a/pr_agent/tools/pr_code_suggestions.py b/pr_agent/tools/pr_code_suggestions.py
index 08074899e..381c02a65 100644
--- a/pr_agent/tools/pr_code_suggestions.py
+++ b/pr_agent/tools/pr_code_suggestions.py
@@ -226,7 +226,7 @@ async def _prepare_prediction_extended(self, model: str) -> dict:
         for i, patches_diff in enumerate(patches_diff_list):
             get_logger().info(f"Processing chunk {i + 1} of {len(patches_diff_list)}")
             self.patches_diff = patches_diff
-            prediction = await self._get_prediction(model)
+            prediction = await self._get_prediction(model)  # TODO: parallelize
             prediction_list.append(prediction)
         self.prediction_list = prediction_list
 
@@ -259,6 +259,9 @@ async def rank_suggestions(self, data: List) -> List:
             suggestion_list.append(suggestion)
         data_sorted = [[]] * len(suggestion_list)
 
+        if len(suggestion_list ) == 1:
+            return suggestion_list
+
         try:
             suggestion_str = ""
             for i, suggestion in enumerate(suggestion_list):