Skip to content

Commit

Permalink
Fix listing and download completions cli bug (#240)
Browse files Browse the repository at this point in the history
* Update fetching completions to not get total

* Update download completions logic without using total

* Update poetry.lock file

* Update test_cli assertion to pass with new changes

* Fix test assertion

* Keep file open while writing data

* Remove --compact from cli_docs

* Remove unused code

* Add warning message for overwriting file and check extensions

* Remove .md and .csv extensions

* Update spinner with bouncingbar
  • Loading branch information
kxtran authored Jul 31, 2024
1 parent 7611486 commit 9626930
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 66 deletions.
2 changes: 0 additions & 2 deletions cli_docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,6 @@ Options:
--to [%Y-%m-%d|%Y-%m-%dT%H:%M:%S|%Y-%m-%d %H:%M:%S]
Set the end date for fetching completions
(inclusive). Use the format: YYYY-MM-DD.
--compact Enable to download only the compact version
of the output.
-f, --file TEXT Specify the filename and path for the output
file.
```
Expand Down
105 changes: 67 additions & 38 deletions log10/cli/completions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,17 @@
import click
import pandas as pd
import rich
import tqdm
from rich.console import Console
from rich.table import Table

from log10._httpx_utils import _get_time_diff, _try_get
from log10.cli_utils import generate_markdown_report, generate_results_table
from log10.completions.completions import (
Completions,
_check_model_support,
_compare,
_get_completion,
_get_completions_url,
_write_completions,
)
from log10.llm import Log10Config
from log10.prompt_analyzer import PromptAnalyzer, convert_suggestion_to_markdown, display_prompt_analyzer_suggestions
Expand All @@ -24,7 +23,7 @@
_log10_config = Log10Config()


def _render_completions_table(completions_data, total_completions):
def _render_completions_table(completions_data):
data_for_table = []
for completion in completions_data:
prompt, response = "", ""
Expand All @@ -45,9 +44,12 @@ def _render_completions_table(completions_data, total_completions):
message = first_choice["message"]
response = (
message.get("content")
or message.get("tool_calls", [])[-1].get("function", {}).get("arguments", "")
if message.get("tool_calls")
else ""
or (
message.get("tool_calls")[-1].get("function", {}).get("arguments", "")
if message.get("tool_calls")
else ""
)
or ""
)
elif "function_call" in first_choice:
response = json.dumps(first_choice.get("function_call", {}))
Expand Down Expand Up @@ -86,7 +88,6 @@ def _render_completions_table(completions_data, total_completions):

console = Console()
console.print(table)
console.print(f"{total_completions=}")


def _render_comparison_table(model_response_raw_data):
Expand Down Expand Up @@ -184,10 +185,9 @@ def list_completions(limit, offset, timeout, tags, from_date, to_date):
res = _try_get(url, timeout)

completions = res.json()
total_completions = completions["total"]
completions = completions["data"]

_render_completions_table(completions, total_completions)
_render_completions_table(completions)


@click.command()
Expand All @@ -201,8 +201,8 @@ def get_completion(id):


@click.command()
@click.option("--limit", default="", help="Specify the maximum number of completions to retrieve.")
@click.option("--offset", default="", help="Set the starting point (offset) from where to begin fetching completions.")
@click.option("--limit", default=50, help="Specify the maximum number of completions to retrieve.")
@click.option("--offset", default=0, help="Set the starting point (offset) from where to begin fetching completions.")
@click.option(
"--timeout", default=10, help="Set the maximum time (in seconds) allowed for the HTTP request to complete."
)
Expand All @@ -219,40 +219,69 @@ def get_completion(id):
type=click.DateTime(),
help="Set the end date for fetching completions (inclusive). Use the format: YYYY-MM-DD.",
)
@click.option("--compact", is_flag=True, help="Enable to download only the compact version of the output.")
@click.option("--file", "-f", default="completions.jsonl", help="Specify the filename and path for the output file.")
def download_completions(limit, offset, timeout, tags, from_date, to_date, compact, file):
@click.option(
"--file",
"-f",
type=click.Path(dir_okay=False),
default="completions.jsonl",
help="Specify the filename and path for the output file. Only .jsonl extension is supported.",
)
def download_completions(limit, offset, timeout, tags, from_date, to_date, file):
"""
Download completions to a jsonl file
"""
base_url = _log10_config.url
org_id = _log10_config.org_id
input_offset = int(offset)
input_limit = int(limit)
fetched_total = 0
batch_size = 10

init_url = _get_completions_url(1, 0, tags, from_date, to_date, base_url, org_id)
res = _try_get(init_url)
if res.status_code != 200:
rich.print(f"Error: {res.json()}")
return
if file:
path = Path(file)
if path.exists():
rich.print(f'Warning: The file "{file}" already exists and will be overwritten.')

total_completions = res.json()["total"]
offset = int(offset) if offset else 0
limit = int(limit) if limit else total_completions
rich.print(f"Download total completions: {limit}/{total_completions}")
if not click.confirm("Do you want to continue?"):
ext_name = path.suffix.lower()
if ext_name not in [".jsonl"]:
raise click.UsageError(f"Only .jsonl extension is supported for the output file. Got: {ext_name}")

console = Console()
track_limit = input_limit if input_limit < batch_size else batch_size
track_offset = input_offset
try:
with console.status("[bold green]Downloading completions...", spinner="bouncingBar") as _status:
with open(file, "w") as output_file:
while True and track_limit > 0:
new_data = Completions()._get_completions(
offset=track_offset,
limit=track_limit,
timeout=timeout,
tag_names=tags,
from_date=from_date,
to_date=to_date,
)

new_data_size = len(new_data)
fetched_total += new_data_size

for completion in new_data:
output_file.write(json.dumps(completion) + "\n")

console.print(f"Downloaded {fetched_total} completions to {file}.")

if new_data_size == 0 or new_data_size < track_limit:
break

track_offset += new_data_size
track_limit = (
input_limit - fetched_total if input_limit - fetched_total < batch_size else batch_size
)
except Exception as e:
rich.print(f"Error fetching completions {e}")
if hasattr(e, "response") and hasattr(e.response, "json") and "error" in e.response.json():
rich.print(e.response.json()["error"])
return

# download completions
pbar = tqdm.tqdm(total=limit)
batch_size = 10
end = offset + limit if offset + limit < total_completions else total_completions
for batch in range(offset, end, batch_size):
current_batch_size = batch_size if batch + batch_size < end else end - batch
download_url = _get_completions_url(
current_batch_size, batch, tags, from_date, to_date, base_url, org_id, printout=False
)
res = _try_get(download_url, timeout)
_write_completions(res, file, compact)
pbar.update(current_batch_size)
rich.print(f"Download total completions: {fetched_total}. Saved to {file}")


@click.command()
Expand Down
61 changes: 48 additions & 13 deletions log10/completions/completions.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json
import logging
import time
from typing import List, Optional

import click
import httpx
Expand All @@ -8,6 +10,14 @@
from log10.llm import Log10Config


# NOTE(review): calling logging.basicConfig at import time from a library
# module is unusual — it mutates the root logger and may override the host
# application's logging configuration; consider leaving this to the app.
logging.basicConfig(
    format="[%(asctime)s - %(name)s - %(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Module-level logger shared by the completions helpers; emits INFO and above.
logger: logging.Logger = logging.getLogger("LOG10")
logger.setLevel(logging.INFO)


_log10_config = Log10Config()


Expand Down Expand Up @@ -75,19 +85,6 @@ def _get_valid_date_range(from_date, to_date):
return date_range


def _write_completions(res, output_file, compact_mode):
"""Processes completions and appends them to the output file."""
with open(output_file, "a") as file:
data = res.json()["data"]
if compact_mode:
for completion in data:
file.write(json.dumps(completion) + "\n")
else:
for completion_id in (completion["id"] for completion in data):
completion = _get_completion(completion_id).json()["data"]
file.write(json.dumps(completion) + "\n")


def _get_llm_repsone(
model: str,
messages: list[dict],
Expand Down Expand Up @@ -203,3 +200,41 @@ def _compare(models: list[str], messages: dict, temperature: float = 0.2, max_to

def _check_model_support(model: str) -> bool:
return model in _SUPPORTED_MODELS


class Completions:
    """Thin client for paging through completions from the Log10 API."""

    completions_path = "/api/completions"

    def __init__(self, log10_config: Log10Config = None):
        # Fall back to the ambient configuration when none is supplied.
        self._log10_config = log10_config or Log10Config()
        self._http_client = httpx.Client()
        self._http_client.headers = {
            "x-log10-token": self._log10_config.token,
            "x-log10-organization-id": self._log10_config.org_id,
            "Content-Type": "application/json",
        }

        self.org_id = self._log10_config.org_id
        self.base_url = self._log10_config.url
        self.url = f"{self.base_url}{self.completions_path}?organization_id={self.org_id}"

    def _get_completions(
        self,
        offset: int,
        limit: int,
        timeout: int,
        tag_names: Optional[List[str]] = None,
        from_date: click.DateTime = None,
        to_date: click.DateTime = None,
        printout: bool = True,
    ) -> List[dict]:
        """Fetch one page of completions.

        Args:
            offset: Index of the first completion to fetch.
            limit: Maximum number of completions to return.
            timeout: HTTP request timeout in seconds.
            tag_names: Optional list of tag filters.
            from_date: Optional inclusive start of the date range filter.
            to_date: Optional inclusive end of the date range filter.
            printout: Forwarded to the URL builder to control its console
                output (previously accepted but silently ignored).

        Returns:
            The list of completion dicts for this page. On a non-200 response
            the error is logged and an empty list is returned so pagination
            loops (which call ``len()`` on the result) terminate cleanly
            instead of crashing on ``None``.
        """
        url = _get_completions_url(
            limit, offset, tag_names, from_date, to_date, self.base_url, self.org_id, printout=printout
        )
        response = _try_get(url, timeout)

        if response.status_code != 200:
            logger.error(f"Error: {response.json()}")
            # Bug fix: returning None here made callers fail with
            # "TypeError: object of type 'NoneType' has no len()".
            return []

        return response.json()["data"]
12 changes: 1 addition & 11 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ def test_list_completions(runner):
result = runner.invoke(cli, ["completions", "list"])
print(result.output)
assert result.exit_code == 0
assert "total_completions=" in result.output


def test_get_completion(runner):
Expand All @@ -30,7 +29,7 @@ def test_get_completion(runner):
def test_download_completions(runner):
result = runner.invoke(cli, ["completions", "download", "--limit", "1", "--tags", "log10/summary-grading"])
assert result.exit_code == 0
assert "Download total completions: 1/" in result.output
assert "Download total completions: 1. Saved to completions.jsonl" in result.output


def test_benchmark_models_with_ids(runner):
Expand Down

0 comments on commit 9626930

Please sign in to comment.