Merge branch 'main' into remove-dataset-formatting-module

behroozazarkhalili · web-flow · commit 1baa79bc8c29 · 2025-10-09T10:06:47.000-07:00
diff --git a/.github/workflows/slow-tests.yml b/.github/workflows/slow-tests.yml
@@ -115,6 +115,4 @@ jobs:
           source .venv/bin/activate
           uv pip install slack_sdk tabulate
           python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
-          python scripts/log_example_reports.py --text_file_name temp_results_sft_tests.txt >> $GITHUB_STEP_SUMMARY
-          python scripts/log_example_reports.py --text_file_name temp_results_dpo_tests.txt >> $GITHUB_STEP_SUMMARY
           rm *.txt
diff --git a/scripts/log_example_reports.py b/scripts/log_example_reports.py
diff --git a/tests/test_judges.py b/tests/test_judges.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import sys
 import time
 
 import pytest
@@ -59,6 +60,9 @@ def load_pair_rm_judge(self):
         raise ValueError("Failed to load PairRMJudge")
 
     @require_llm_blender
+    @pytest.mark.skipif(
+        sys.version_info[:3] == (3, 13, 8), reason="Python 3.13.8 has a bug in inspect.BlockFinder (cpython GH-139783)"
+    )
     def test_pair_rm_judge(self):
         judge = self.load_pair_rm_judge()
         prompts, completions = self._get_prompts_and_pairwise_completions()
@@ -68,6 +72,9 @@ def test_pair_rm_judge(self):
         assert ranks == [0, 1]
 
     @require_llm_blender
+    @pytest.mark.skipif(
+        sys.version_info[:3] == (3, 13, 8), reason="Python 3.13.8 has a bug in inspect.BlockFinder (cpython GH-139783)"
+    )
     def test_pair_rm_judge_return_scores(self):
         judge = self.load_pair_rm_judge()
         prompts, completions = self._get_prompts_and_pairwise_completions()