From fa134cdd7af2b4b9841494ee808003a0a78710f4 Mon Sep 17 00:00:00 2001
From: Robert Haase
Date: Tue, 31 Dec 2024 00:08:47 +0100
Subject: [PATCH] add multi-agent example

---
 docs/66_arxiv_agent/arxiv_utilities.py        |  40 +-
 .../multiagent_write_review.ipynb             | 641 ++++++++++++++++++
 docs/_toc.yml                                 |   3 +-
 3 files changed, 682 insertions(+), 2 deletions(-)
 create mode 100644 docs/66_arxiv_agent/multiagent_write_review.ipynb

diff --git a/docs/66_arxiv_agent/arxiv_utilities.py b/docs/66_arxiv_agent/arxiv_utilities.py
index 7141e66..7878ad6 100644
--- a/docs/66_arxiv_agent/arxiv_utilities.py
+++ b/docs/66_arxiv_agent/arxiv_utilities.py
@@ -66,6 +66,23 @@ def check_if_arxiv_paper_is_cc_by_40(link):
     license_response = requests.get(link)
     return license_response.status_code == 200 and "https://arxiv.org/icons/licenses/by-4.0.png" in license_response.text
 
+def get_arxiv_metadata(arxiv_id):
+    import arxiv
+    search = arxiv.Search(id_list=[arxiv_id])
+    paper = next(search.results())
+
+    metadata = {
+        'title': paper.title,
+        'authors': [author.name for author in paper.authors],
+        'published': paper.published,
+        'summary': paper.summary,
+        'doi': paper.doi,
+        'primary_category': paper.primary_category,
+        'categories': paper.categories,
+        'pdf_url': paper.pdf_url
+    }
+    return metadata
+
 def convert_to_markdown(papers):
     md_content = "# Search Results\n\n"
     for idx, paper in enumerate(papers, start=1):
@@ -223,4 +240,25 @@ def make_powerpoint_slides(slides:str, filename="slides.pptx"):
 
     return f"The Powerpoint Presentation was saved as [{filename}]({filename})"
 
-
+def prompt_scadsai_llm(message:str, model="meta-llama/Llama-3.3-70B-Instruct"):
+    """A prompt helper function that sends a message to the ScaDS.AI LLM server at
+    ZIH TU Dresden and returns only the text response.
+    """
+    import os
+    import openai
+
+    # convert message into the right format if necessary
+    if isinstance(message, str):
+        message = [{"role": "user", "content": message}]
+
+    # setup connection to the LLM
+    client = openai.OpenAI(base_url="https://llm.scads.ai/v1",
+                           api_key=os.environ.get('SCADSAI_API_KEY')
+                           )
+    response = client.chat.completions.create(
+        model=model,
+        messages=message
+    )
+
+    # extract answer
+    return response.choices[0].message.content
diff --git a/docs/66_arxiv_agent/multiagent_write_review.ipynb b/docs/66_arxiv_agent/multiagent_write_review.ipynb
new file mode 100644
index 0000000..9058adc
--- /dev/null
+++ b/docs/66_arxiv_agent/multiagent_write_review.ipynb
@@ -0,0 +1,641 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "617d3708-b2ef-456c-9283-29adf3ce7e14",
+   "metadata": {},
+   "source": [
+    "# A multi-agent system writing a manuscript\n",
+    "\n",
+    "In this notebook we will use a multi-agent system to write a review about some arxiv papers. We will use [smolagents](https://github.com/huggingface/smolagents) to define agents with different responsibilities:\n",
+    "* A research assistant which can read arxiv papers and write manuscript summaries.\n",
+    "* A reviewer which will provide constructive feedback given a text.\n",
+    "* A scientific writer which will incorporate the feedback and write a final manuscript.\n",
+    "* A printer which will print the final manuscript.\n",
+    "* A scheduler which distributes tasks to the other agents.\n",
+    "\n",
+    "**Note:** For technical reasons, we only read the abstract. \n",
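+    "Abstracts are fetched via the `get_arxiv_metadata` helper from `arxiv_utilities.py` (added in this patch). As a minimal sketch, assuming the `arxiv` package is installed, this is all an agent gets to see of a paper:\n",
+    "\n",
+    "```python\n",
+    "from arxiv_utilities import get_arxiv_metadata\n",
+    "\n",
+    "meta = get_arxiv_metadata(\"2211.11501\")  # one of the papers used below\n",
+    "print(meta[\"title\"])    # paper title\n",
+    "print(meta[\"summary\"])  # the abstract the agents will work with\n",
+    "```\n",
+    "\n",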
\n", + "Of course, as in read life, it would be better to read the entire paper, but this exceeds token limits of SOTA open-weight LLMs.\n", + "\n", + "This notebook will only work with an upcoming release of smolagents (> 0.1.3), after this PR is merged: https://github.com/huggingface/smolagents/pull/12" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a53898ba-8cbe-44b3-8268-82f3fe92492c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\rober\\miniforge3\\envs\\genai-gpu\\Lib\\site-packages\\pydantic\\_internal\\_config.py:345: UserWarning: Valid config keys have changed in V2:\n", + "* 'fields' has been removed\n", + " warnings.warn(message, UserWarning)\n" + ] + } + ], + "source": [ + "from IPython.display import display, Markdown\n", + "from smolagents.agents import ToolCallingAgent, CodeAgent\n", + "from smolagents import tool, LiteLLMModel\n", + "import os\n", + "\n", + "# these functions are defined in arxiv_utilities.py\n", + "from arxiv_utilities import prompt_scadsai_llm, get_arxiv_metadata" + ] + }, + { + "cell_type": "markdown", + "id": "198a9dd9-4222-4210-adc3-56d1d581540d", + "metadata": {}, + "source": [ + "We configure to use a model provided on our institutional LLM server." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "22c30391-42b9-4055-b872-ca9a67605bac", + "metadata": {}, + "outputs": [], + "source": [ + "model = \"openai/meta-llama/Llama-3.3-70B-Instruct\"\n", + "api_base = \"https://llm.scads.ai/v1\"\n", + "api_key = os.environ.get('SCADSAI_API_KEY')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7ee8ef8f-07dd-4b56-b7c0-4a6db446bb46", + "metadata": {}, + "outputs": [], + "source": [ + "prompt = prompt_scadsai_llm" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6b1046ed-28a3-498d-980d-be94d6ecdfac", + "metadata": {}, + "outputs": [], + "source": [ + "# comment this to show detailed output\n", + "from smolagents.utils import console\n", + "console.quiet = True" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "56be0f58-fd7a-4b8e-accc-f578a42dcfce", + "metadata": {}, + "outputs": [], + "source": [ + "verbose = True" + ] + }, + { + "cell_type": "markdown", + "id": "da008a82-2567-46c2-beed-49d8bd6bd009", + "metadata": {}, + "source": [ + "First, we define a function that generates a new agent with given name, description, system message, etc." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fb15e205-29d7-4ab7-9571-285798937f8d", + "metadata": {}, + "outputs": [], + "source": [ + "def create_agent(name, description, tools, model, api_base=None, api_key=None, system_message=None):\n", + " \"\"\"Create an agent that uses a given list of tools according to its system message.\"\"\"\n", + " model = LiteLLMModel(model_id=model, \n", + " api_base=api_base, \n", + " api_key=api_key)\n", + "\n", + " agent = CodeAgent(tools=tools, model=model, system_prompt=system_message)\n", + " agent.name = name\n", + " agent.description = description\n", + "\n", + " return agent" + ] + }, + { + "cell_type": "markdown", + "id": "bc9eecba-6524-4771-b388-0c855a3177e4", + "metadata": {}, + "source": [ + "In this example we use a [factory pattern](https://en.wikipedia.org/wiki/Factory_method_pattern) for agents, to ensure that for every task, a new agent is created. This avoids very long prompts with former, irrelevant conversations." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f368b1c5-9ae5-4885-bd38-afb734e3e7c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def agent_factory(*args, **kwargs):\n",
+    "    def create_instance():\n",
+    "        return create_agent(*args, **kwargs)\n",
+    "    return create_instance"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e5fdb19-478b-402a-bba5-8cf1ee85ac51",
+   "metadata": {},
+   "source": [
+    "## Research-assistant agent\n",
+    "We now define tools that can be called by the agents, and an agent factory for the research-assistant. This agent can read arxiv meta-data, such as paper titles, authors and summaries."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "c7a699fb-621e-4395-b44e-deab68d77878",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tool\n",
+    "def read_arxiv_paper(arxiv_url:str)->str:\n",
+    "    \"\"\"Read the abstract of an arxiv-paper and return its most important contents in markdown format.\n",
+    "\n",
+    "    Args:\n",
+    "        arxiv_url: url of the Arxiv paper\n",
+    "    \"\"\"\n",
+    "    if verbose:\n",
+    "        print(f\"read_arxiv_paper({arxiv_url})\")\n",
+    "    arxiv_id = arxiv_url.split(\"/\")[-1]\n",
+    "    metadata = get_arxiv_metadata(arxiv_id)\n",
+    "    title = metadata[\"title\"]\n",
+    "    summary = metadata[\"summary\"]\n",
+    "    authors = \", \".join(metadata[\"authors\"])\n",
+    "    \n",
+    "    return f\"\"\"## {title}\n",
+    "By {authors}\n",
+    "\n",
+    "{summary}\n",
+    "\"\"\"\n",
+    "\n",
+    "research_agent_factory = agent_factory(\n",
+    "    name=\"research-assistant\",\n",
+    "    description=\"Scientific assistant who can read a paper and provide a summary of it.\",\n",
+    "    system_message=\"\"\"You will be tasked to read paper(s) and provide a summary. \n",
+    "    Write a very detailed manuscript of about 1000 words outlining the major messages and limitations of a given paper.\"\"\",\n",
+    "    tools=[read_arxiv_paper],\n",
+    "    model=model, \n",
+    "    api_base=api_base, \n",
+    "    api_key=api_key,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2de1b6a8-f6f0-400b-a5ee-4e7414762d57",
+   "metadata": {},
+   "source": [
+    "## Scientific writer\n",
+    "We also define a scientific writer agent which can rewrite a manuscript by incorporating given feedback."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "bcfbb712-078e-444d-aabb-520e01ab1538",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tool\n",
+    "def improve_manuscript(manuscript:str, feedback:str)->str:\n",
+    "    \"\"\"Improve a given manuscript text according to the given feedback.\n",
+    "\n",
+    "    Args:\n",
+    "        manuscript: The complete manuscript text to improve\n",
+    "        feedback: feedback to incorporate\n",
+    "    \"\"\"\n",
+    "    if verbose:\n",
+    "        short = manuscript[:100]\n",
+    "        num_chars = len(manuscript)\n",
+    "        num_lines = len(manuscript.split(\"\\n\"))\n",
+    "        short_feedback = feedback[:100]\n",
+    "        num_chars_feedback = len(feedback)  # length of the full feedback, not just the preview\n",
+    "        print(f\"improve_manuscript({short}...[{num_chars} chars, {num_lines} lines], {short_feedback})... [{num_chars_feedback} chars]\")\n",
+    "    return prompt(f\"\"\"Improve a manuscript by incorporating given feedback.\n",
+    "\n",
+    "## Manuscript\n",
+    "\n",
+    "{manuscript}\n",
+    "\n",
+    "## Feedback\n",
+    "\n",
+    "{feedback}\n",
+    "\n",
+    "## Your task\n",
+    "\n",
+    "Improve the manuscript above by incorporating the feedback. \n",
+    "Do not shorten it! \n",
+    "Do not remove important details! \n",
\n", + "Use markdown links to cite sources.\n", + "Do not make up references!\n", + "Return the updated manuscript in markdown format only.\n", + "\"\"\")\n", + "\n", + "scientific_writer_factory = agent_factory(\n", + " name=\"scientific-writer\",\n", + " description=\"Scientific writer who improves manuscripts.\",\n", + " system_message=\"\"\"You will be tasked to rewrite a text by incorporating given feedback.\"\"\",\n", + " tools=[improve_manuscript],\n", + " model=model, \n", + " api_base=api_base, \n", + " api_key=api_key,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ca1c7e93-a42d-48f2-980c-deb5d5afc69b", + "metadata": {}, + "outputs": [], + "source": [ + "## for testing:\n", + "#research_agent_factory().run(\"Read arxiv paper 2211.11501 and tell me the most important content in one sentence.\")" + ] + }, + { + "cell_type": "markdown", + "id": "60f73da4-56f0-4166-b61b-4d08d91da8e0", + "metadata": {}, + "source": [ + "## Reviewer agent\n", + "Next we define an LLM-based tool that can generate feedback for a given manuscript. Note: We are using the same LLM/server here as the agents use under the hood. This not necessary. One might use different LLMs for different tasks." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2158fd67-b89a-476f-962d-d3426261b0eb", + "metadata": {}, + "outputs": [], + "source": [ + "@tool\n", + "def review_text(manuscript:str)->str:\n", + " \"\"\"Reviews text and provides constructive feedback\n", + " \n", + " Args:\n", + " manuscript: complete original manuscript text to review. \n", + " \"\"\"\n", + " if verbose:\n", + " short = manuscript[:100]\n", + " num_chars = len(manuscript)\n", + " num_lines = len(manuscript.split(\"\\n\"))\n", + " print(f\"review_text({short}...[{num_chars} chars, {num_lines} lines])\")\n", + " feedback = prompt(f\"\"\"\n", + "You are a great reviewer and you like to provide constructive feedback. \n", + "If you are provided with a manuscript, you formulate feedback specifically for this manuscript. \n", + "Your goal is to guide the author towards writing\n", + "* a scientific text with a short and descriptive title,\n", + "* a scientific text with markdown sub-sections (# title, ## headlines, ...) avoiding bullet points,\n", + "* structured in sub-sections by content, e.g. introduction, recent developments, methods, results, discussion, future work, ...\n", + "* text using high-quality scientific language,\n", + "* proper citations using markdown links to original paper urls (do not make up references!),\n", + "* a clear abstract at the beginning of the text, and conclusions at the end\n", + "\n", + "## Manuscript\n", + "This is the manuscript you are asked to review:\n", + "\n", + "{manuscript}\n", + "\n", + "## Your task\n", + "Provide constructive feedback to the manuscript above.\n", + "\"\"\")\n", + " return feedback\n", + "\n", + "reviewer_agent_factory = agent_factory(\n", + " name=\"reviewer\",\n", + " description=\"A reviewer who gives constructive feedback\",\n", + " tools=[review_text],\n", + " model=model, \n", + " api_base=api_base, \n", + " api_key=api_key,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3a8e30c5-ac2e-4667-a81a-ba69802aac32", + "metadata": {}, + "source": [ + "## Printer agent\n", + "As it was hard to make the system return the final manuscript, we define an agent who is supposed to print the final manuscript. We can use this strategy to secretly also write the manuscript to a file." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "ec6be4fc-61ba-4132-a28a-a2e68b5f9bc9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tool\n",
+    "def print_manuscript(manuscript:str)->str:\n",
+    "    \"\"\"Prints out a manuscript provided in markdown format.\n",
+    "    \n",
+    "    Args:\n",
+    "        manuscript: An original manuscript text to print in markdown format containing all line breaks, headlines, etc\n",
+    "    \"\"\"\n",
+    "    from IPython.display import display, Markdown\n",
+    "    display(Markdown(manuscript))\n",
+    "\n",
+    "    with open(\"manuscript.md\", \"w\") as file:\n",
+    "        file.write(manuscript)\n",
+    "\n",
+    "    return \"The manuscript was printed.\"\n",
+    "\n",
+    "printer_agent_factory = agent_factory(\n",
+    "    name=\"printer\",\n",
+    "    description=\"A professional printing expert who will print markdown-formatted text.\",\n",
+    "    tools=[print_manuscript],\n",
+    "    model=model, \n",
+    "    api_base=api_base, \n",
+    "    api_key=api_key,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3ffd55f8-129a-42ce-acba-894ef8baada5",
+   "metadata": {},
+   "source": [
+    "## Scheduler agent\n",
+    "The scheduler is given a team of agents and can choose between them. For every task, it creates a new agent using the respective factory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "bcfc1aed-3603-4abd-823d-eb45b8d39b8d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "team = [research_agent_factory, reviewer_agent_factory, scientific_writer_factory, printer_agent_factory]\n",
+    "\n",
+    "@tool\n",
+    "def distribute_sub_task(task_description:str, assistant:str)->str:\n",
+    "    \"\"\"Prompt an assistant to solve a certain task. \n",
+    "    \n",
+    "    Args:\n",
+    "        task_description: Detailed task description; make sure to provide all necessary details. When handling text, hand over the complete original text, unmodified, containing all line-breaks, headlines, etc.\n",
+    "        assistant: name of the assistant that should take care of the task.\n",
+    "    \"\"\"\n",
+    "    for t_factory in team:\n",
+    "        t = t_factory()\n",
+    "        if t.name == assistant:\n",
+    "            if verbose:\n",
+    "                print(\"\".join([\"-\"]*80))\n",
+    "                short = task_description[:100]\n",
+    "                num_chars = len(task_description)\n",
+    "                print(f\"| I am asking {assistant} to take care of: {short}...[{num_chars} chars]\")\n",
+    "\n",
+    "            # execute the task\n",
+    "            result = t.run(task_description)\n",
+    "            \n",
+    "            if verbose:\n",
+    "                short = result[:100]\n",
+    "                num_chars = len(result)\n",
+    "                print(f\"| Response was: {short}...[{num_chars} chars]\")\n",
+    "                print(\"\".join([\"-\"]*80))\n",
+    "\n",
+    "            return result\n",
+    "\n",
+    "    return \"Assistant unknown\"\n",
+    "\n",
+    "team_description = \"\\n\".join([f\"* {t().name}: {t().description}\" for t in team])\n",
+    "    \n",
+    "scheduler = create_agent(\n",
+    "    name=\"scheduler\",\n",
+    "    tools=[distribute_sub_task],\n",
+    "    description=\"Scheduler splits tasks into sub-tasks and distributes them.\",\n",
+    "    system_message=f\"\"\"\n",
+    "You are an editor who has a team of assistants. Your task is to write a manuscript together with your team.\n",
+    "\n",
+    "# Team\n",
+    "Your assistants can either read and summarize papers for you, or review text you wrote and provide feedback.\n",
+    "\n",
+    "Your team members are:\n",
+    "{team_description}\n",
+    "\n",
+    "# Typical workflow\n",
+    "A typical workflow is like this:\n",
+    "* Read papers\n",
+    "* Summarize them in a first manuscript draft\n",
+    "* Review the manuscript\n",
+    "* Incorporate review feedback to improve the manuscript\n",
+    "* Print the final manuscript in markdown format.\n",
+    "\n",
+    "# Hints\n",
+    "When distributing tasks, make sure to provide all necessary details to the assistants. \n",
+    "Never shorten text when giving tasks to assistants. Provide them with the full manuscript text.\n",
+    "\n",
+    "# Your task\n",
+    "Distribute tasks to your team. Goal is to have a great scientific manuscript.\n",
+    "\"\"\",\n",
+    "    model=model, \n",
+    "    api_base=api_base, \n",
+    "    api_key=api_key,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "da4d816b-5b24-4acf-9a53-6d5342ae91df",
+   "metadata": {},
+   "source": [
+    "## Writing a manuscript\n",
+    "Finally, we can ask the scheduler to distribute sub-tasks to the agents and produce the final result. Note that the task description is generic. It does not mention what the manuscript should be about. The system has to figure this out by reading the online resources."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "4ef885ad-e7fa-4eb2-97e0-7a70b32bb68f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n",
+      "| I am asking research-assistant to take care of: Read the paper https://arxiv.org/abs/2211.11501 and provide a summary of it....[76 chars]\n",
+      "read_arxiv_paper(https://arxiv.org/abs/2211.11501)\n",
+      "| Response was: The paper introduces the DS-1000 benchmark, which consists of 1000 data science problems spanning 7 ...[371 chars]\n",
+      "--------------------------------------------------------------------------------\n",
+      "--------------------------------------------------------------------------------\n",
+      "| I am asking research-assistant to take care of: Read the paper https://arxiv.org/abs/2308.16458 and provide a summary of it....[76 chars]\n",
+      "read_arxiv_paper(https://arxiv.org/abs/2308.16458)\n",
+      "| Response was: The paper introduces BioCoder, a benchmark for evaluating large language models (LLMs) in generating...[1825 chars]\n",
+      "--------------------------------------------------------------------------------\n",
+      "--------------------------------------------------------------------------------\n",
+      "| I am asking research-assistant to take care of: Read the paper https://arxiv.org/abs/2411.07781 and provide a summary of it....[76 chars]\n",
+      "read_arxiv_paper(https://arxiv.org/abs/2411.07781)\n",
+      "| Response was: The paper proposes a benchmark for evaluating the safety of code agents, highlighting the need for s...[152 chars]\n",
+      "--------------------------------------------------------------------------------\n",
+      "--------------------------------------------------------------------------------\n",
+      "| I am asking research-assistant to take care of: Read the paper https://arxiv.org/abs/2408.13204 and provide a summary of it....[76 chars]\n",
+      "read_arxiv_paper(https://arxiv.org/abs/2408.13204)\n",
+      "| Response was: The paper 'DOMAINEVAL: An Auto-Constructed Benchmark for Multi-Domain Code Generation' introduces a ...[356 chars]\n",
+      "--------------------------------------------------------------------------------\n",
+      "--------------------------------------------------------------------------------\n",
+      "| I am asking research-assistant to take care of: Read the paper https://arxiv.org/abs/2406.15877 and provide a summary of it....[76 chars]\n",
+      "read_arxiv_paper(https://arxiv.org/abs/2406.15877)\n",
+      "| Response was: The paper introduces BigCodeBench, a benchmark that challenges LLMs to invoke multiple function call...[458 chars]\n",
+      "--------------------------------------------------------------------------------\n",
+      "--------------------------------------------------------------------------------\n",
+      "| I am asking reviewer to take care of: Review the manuscript draft: Introduction:\n",
+      "The recent papers https://arxiv.org/abs/2211.11501, https...[3533 chars]\n",
+      "review_text(\n",
+      "Introduction:\n",
+      "The recent papers https://arxiv.org/abs/2211.11501, https://arxiv.org/abs/2308.16458,...[3506 chars, 15 lines])\n",
+      "| Response was: The manuscript draft has been reviewed, and the feedback suggests several improvements to enhance it...[244 chars]\n",
+      "--------------------------------------------------------------------------------\n",
+      "--------------------------------------------------------------------------------\n",
+      "| I am asking scientific-writer to take care of: Improve the manuscript draft: Introduction:\n",
+      "The recent papers https://arxiv.org/abs/2211.11501, http...[3809 chars]\n",
+      "improve_manuscript(\n",
+      "Introduction:\n",
+      "The recent papers https://arxiv.org/abs/2211.11501, https://arxiv.org/abs/2308.16458,...[3506 chars, 15 lines], \n",
+      "The manuscript draft has been reviewed, and the feedback suggests several improvements to enhance i)... [100 chars]\n",
+      "| Response was: \n",
+      "### Introduction\n",
+      "The recent papers [https://arxiv.org/abs/2211.11501](https://arxiv.org/abs/2211.11...[5696 chars]\n",
+      "--------------------------------------------------------------------------------\n",
+      "--------------------------------------------------------------------------------\n",
+      "| I am asking printer to take care of: Print the manuscript: \n",
+      "### Introduction\n",
+      "The recent papers [https://arxiv.org/abs/2211.11501](https:/...[5718 chars]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/markdown": [
+       "\n",
+       "### Introduction\n",
+       "The recent papers [https://arxiv.org/abs/2211.11501](https://arxiv.org/abs/2211.11501), [https://arxiv.org/abs/2308.16458](https://arxiv.org/abs/2308.16458), [https://arxiv.org/abs/2411.07781](https://arxiv.org/abs/2411.07781), [https://arxiv.org/abs/2408.13204](https://arxiv.org/abs/2408.13204), and [https://arxiv.org/abs/2406.15877](https://arxiv.org/abs/2406.15877) introduce new benchmarks and evaluate the performance of large language models (LLMs) on fine-grained tasks.\n",
+       "\n",
+       "### Summary of Papers\n",
+       "The paper introduces the DS-1000 benchmark, which consists of 1000 data science problems spanning 7 Python libraries. The benchmark has three core features: realistic and practical use cases, reliable automatic evaluation, and defense against memorization. The current best public system, Codex-002, achieves 43.3% accuracy on the benchmark, leaving room for improvement.\n",
+       "\n",
+       "The paper introduces [BioCoder](https://arxiv.org/abs/2308.16458), a benchmark for evaluating large language models (LLMs) in generating bioinformatics-specific code. The benchmark consists of 1,026 Python functions and 1,243 Java methods extracted from GitHub, as well as 253 examples from the Rosalind Project. The paper also presents the results of evaluating various models using BioCoder, including InCoder, CodeGen, and GPT-3.5. The results show that successful models accommodate long prompts with full context and contain domain-specific knowledge of bioinformatics. One of the major contributions of the paper is the development of the BioCoder benchmark, which provides a comprehensive evaluation of LLMs in generating bioinformatics code. The benchmark covers a wide range of bioinformatics calculations and includes a fuzz-testing framework for evaluation. The paper also demonstrates the effectiveness of fine-tuning a model (StarCoder) on the BioCoder dataset, resulting in a significant improvement in performance. However, the paper also highlights some limitations of the study. One of the limitations is that the BioCoder benchmark is limited to bioinformatics-specific code and may not be generalizable to other domains. Another limitation is that the evaluation of the models is based on a specific set of metrics (Pass@K), which may not capture all aspects of the models' performance. In conclusion, the paper provides a significant contribution to the field of bioinformatics and natural language processing. The development of the BioCoder benchmark and the evaluation of various models provide valuable insights into the capabilities and limitations of LLMs in generating bioinformatics code. However, further research is needed to address the limitations of the study and to explore the generalizability of the results to other domains.\n",
+       "\n",
+       "The paper proposes a benchmark for evaluating the safety of code agents, highlighting the need for stringent safety evaluations for diverse code agents.\n",
+       "\n",
+       "The paper '[DOMAINEVAL: An Auto-Constructed Benchmark for Multi-Domain Code Generation](https://arxiv.org/abs/2411.07781)' introduces a new benchmark for evaluating LLMs' coding capabilities in multiple domains. The results show that LLMs have strengths and weaknesses in different domains, and highlight the importance of generating more samples and developing more comprehensive benchmarks.\n",
+       "\n",
+       "The paper introduces [BigCodeBench](https://arxiv.org/abs/2408.13204), a benchmark that challenges LLMs to invoke multiple function calls as tools from various libraries and domains for fine-grained tasks. The evaluation of 60 LLMs shows that they are not yet capable of following complex instructions to use function calls precisely, with scores up to 60% compared to human performance of 97%. This highlights the limitations of current LLMs and the need for further advancements in this area.\n",
+       "\n",
+       "### Conclusion\n",
+       "The manuscripts discussed above provide a comprehensive overview of the current state of large language models in fine-grained tasks. By introducing new benchmarks such as DS-1000, BioCoder, DOMAINEVAL, and BigCodeBench, these papers highlight the strengths and limitations of current LLMs and provide valuable insights into their capabilities and limitations. The results of the evaluations demonstrate the need for further research to address the limitations of current LLMs and to develop more comprehensive benchmarks that can accurately assess their performance. Overall, the papers contribute significantly to the field of natural language processing and provide a foundation for future research in this area.\n",
+       "\n",
+       "### Future Work\n",
+       "To further improve the performance of LLMs on fine-grained tasks, future research should focus on developing more comprehensive benchmarks that can accurately assess their capabilities and limitations. Additionally, the development of new models that can accommodate long prompts with full context and contain domain-specific knowledge is crucial. The evaluation of models using a wide range of metrics is also essential to capture all aspects of their performance. By addressing the limitations of current LLMs and developing more comprehensive benchmarks, future research can lead to significant advancements in the field of natural language processing.\n",
+       "\n",
+       "### References\n",
+       "The recent papers [https://arxiv.org/abs/2211.11501](https://arxiv.org/abs/2211.11501), [https://arxiv.org/abs/2308.16458](https://arxiv.org/abs/2308.16458), [https://arxiv.org/abs/2411.07781](https://arxiv.org/abs/2411.07781), [https://arxiv.org/abs/2408.13204](https://arxiv.org/abs/2408.13204), and [https://arxiv.org/abs/2406.15877](https://arxiv.org/abs/2406.15877) provide a comprehensive overview of the current state of large language models in fine-grained tasks.\n"
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "| Response was: The manuscript has been printed....[32 chars]\n",
+      "--------------------------------------------------------------------------------\n"
+     ]
+    }
+   ],
+   "source": [
+    "manuscript = scheduler.run(\"\"\"\n",
+    "Please take care of ALL the following tasks:\n",
+    "* Read these papers and summarize them individually.\n",
+    "  * https://arxiv.org/abs/2211.11501\n",
+    "  * https://arxiv.org/abs/2308.16458\n",
+    "  * https://arxiv.org/abs/2411.07781\n",
+    "  * https://arxiv.org/abs/2408.13204\n",
+    "  * https://arxiv.org/abs/2406.15877\n",
+    "* write a manuscript text about the papers, \n",
+    "* review the manuscript to get constructive feedback\n",
+    "* improve the manuscript \n",
+    "* print the final manuscript\n",
+    "\"\"\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c585f2f1-2c07-4573-8d30-415a6f83cb50",
+   "metadata": {},
+   "source": [
+    "## Exercise\n",
+    "Modify the task above and challenge the system. Provide URLs to arxiv papers on different topics, e.g. papers you know well. Also provide URLs to papers which don't exist.\n",
+    "\n",
+    "Question the output: How much of the text is made up, and how much was actually content of the papers (abstracts)?\n",
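+    "\n",
+    "As a possible starting point, here is a sketch with one well-known paper and one deliberately invented arxiv ID, to probe how the system handles a non-existing source:\n",
+    "\n",
+    "```python\n",
+    "manuscript = scheduler.run(\"\"\"\n",
+    "Please take care of ALL the following tasks:\n",
+    "* Read these papers and summarize them individually.\n",
+    "  * https://arxiv.org/abs/1706.03762\n",
+    "  * https://arxiv.org/abs/9999.99999\n",
+    "* write a manuscript text about the papers,\n",
+    "* review the manuscript to get constructive feedback\n",
+    "* improve the manuscript\n",
+    "* print the final manuscript\n",
+    "\"\"\")\n",
+    "```"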
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "345fc719-7590-48e2-be9c-2b8fed3ffb71",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "76361725-6474-4c21-a812-e039a6a8c061",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bb004698-edc1-4f8e-8f29-c007b5a0007d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/_toc.yml b/docs/_toc.yml
index 21a0d56..aad3542 100644
--- a/docs/_toc.yml
+++ b/docs/_toc.yml
@@ -116,8 +116,9 @@ parts:
     - file: 30_function_calling/55_microscope_stage_demo.ipynb
     - file: 35_agents/autogen_agentchat.ipynb
     - file: 35_agents/react_agent_llama_index.ipynb
-    - file: 35_agents/smolagents.ipynb
     - file: 66_arxiv_agent/arxiv_powerpoint_karaoke.ipynb
+    - file: 35_agents/smolagents.ipynb
+    - file: 66_arxiv_agent/multiagent_write_review.ipynb
     - file: 70_fine_tuning/readme.md
       sections: