Address DouweM's review comments in the data analyst example

tonyxwz · tonyxwz · commit c65c67a38006 · 2025-07-29T20:25:26.000+02:00
diff --git a/examples/pydantic_ai_examples/data_analyst.py b/examples/pydantic_ai_examples/data_analyst.py
@@ -1,25 +1,30 @@
 from dataclasses import dataclass
 
+import datasets
+import duckdb
+import pandas as pd
 from devtools import debug
 
 from pydantic_ai import Agent, ModelRetry, RunContext
 
-try:
-    import datasets
-    import duckdb
-    import pandas as pd
-except ImportError as e:
-    raise ImportError(
-        'Please install both duckdb and pandas.\n'
-        '- pip: `pip install duckdb pandas\n'
-        '- uv: `uv pip install duckdb pandas'
-    ) from e
-
 
 @dataclass
 class AnalystAgentDeps:
     output: dict[str, pd.DataFrame]
 
+    def store(self, value: pd.DataFrame) -> str:
+        """Store the output in deps and return a reference, such as Out[1], for the LLM to use."""
+        ref = f'Out[{len(self.output) + 1}]'
+        self.output[ref] = value
+        return ref
+
+    def get(self, ref: str) -> pd.DataFrame:
+        if ref not in self.output:
+            raise ModelRetry(
+                f'Error: {ref} is not a valid variable reference. Check the previous messages and try again.'
+            )
+        return self.output[ref]
+
 
 analyst_agent = Agent(
     'openai:gpt-4o',
@@ -41,6 +46,7 @@ def load_dataset(
         path: name of the dataset in the form of `<user_name>/<dataset_name>`
         split: load the split of the dataset (default: "train")
     """
+    # begin load data from hf
     builder = datasets.load_dataset_builder(path)  # pyright: ignore[reportUnknownMemberType]
     splits: dict[str, datasets.SplitInfo] = builder.info.splits or {}  # pyright: ignore[reportUnknownMemberType]
     if split not in splits:
@@ -53,14 +59,19 @@ def load_dataset(
     assert isinstance(dataset, datasets.Dataset)
     dataframe = dataset.to_pandas()
     assert isinstance(dataframe, pd.DataFrame)
-    ref = f'Out[{len(ctx.deps.output) + 1}]'
-    ctx.deps.output[ref] = dataframe
-    output = [f'Loaded the dataset as `{ref}`.']
-    if dataset.info.description:
-        output.append(f'Description: {dataset.info.description}')
-    if dataset.info.features:
-        output.append(f'Features: {dataset.info.features!r}')
-    return '\n'.join(output)
+    # end load data from hf
+
+    # store the dataframe in the deps and get a ref like "Out[1]"
+    ref = ctx.deps.store(dataframe)
+    # construct a summary of the loaded dataset
+    output = [
+        f'Loaded the dataset as `{ref}`.',
+        f'Description: {dataset.info.description}'
+        if dataset.info.description
+        else None,
+        f'Features: {dataset.info.features!r}' if dataset.info.features else None,
+    ]
+    return '\n'.join(filter(None, output))
 
 
 @analyst_agent.tool
@@ -76,15 +87,15 @@ def run_duckdb(ctx: RunContext[AnalystAgentDeps], dataset: str, sql: str) -> str
     """
     data = ctx.deps.output[dataset]
     result = duckdb.query_df(df=data, virtual_table_name='dataset', sql_query=sql)
-    ref = f'Out[{len(ctx.deps.output) + 1}]'
-    ctx.deps.output[ref] = result.df()  # pyright: ignore[reportUnknownMemberType]
+    # return a ref, not the data: a DuckDB query can select many rows, yielding another huge dataframe
+    ref = ctx.deps.store(result.df())  # pyright: ignore[reportUnknownMemberType]
     return f'Executed SQL, result is `{ref}`'
 
 
 @analyst_agent.tool
 def display(ctx: RunContext[AnalystAgentDeps], name: str) -> str:
-    """Display the dataframe at most 5 rows."""
-    dataset = ctx.deps.output[name]
+    """Display at most 5 rows of the dataframe."""
+    dataset = ctx.deps.get(name)
     return dataset.head().to_string()  # pyright: ignore[reportUnknownMemberType]