diff --git a/notebooks/AutoResearch_w_RAG_LlamaIndex.ipynb b/notebooks/AutoResearch_w_RAG_LlamaIndex.ipynb
new file mode 100644
index 000000000..e2badd7c2
--- /dev/null
+++ b/notebooks/AutoResearch_w_RAG_LlamaIndex.ipynb
@@ -0,0 +1,673 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Paper Savior with LionAGI and LlamaIndex Vector Index\n",
+ "\n",
+ "-- how to do auto explorative research with LionAGI plus RAG using llamaindex Vector Index & embedding \n",
+ "\n",
+ "- [LionAGI](https://github.com/lion-agi/lionagi)\n",
+ "- [LlamaIndex](https://www.llamaindex.ai)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# %pip install lionagi pypdf llama_index"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import lionagi as li"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Build a Vector Index with llama_index"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# define a function to get index\n",
+ "\n",
+ "def get_index(chunks):\n",
+ " from llama_index import ServiceContext, VectorStoreIndex\n",
+ " from llama_index.llms import OpenAI\n",
+ "\n",
+ " llm = OpenAI(temperature=0.1, model=\"gpt-4-1106-preview\")\n",
+ " service_context = ServiceContext.from_defaults(llm=llm)\n",
+ " return VectorStoreIndex(chunks, include_embeddings=True, service_context=service_context)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# get llamaindex textnodes, if to_datanode is false, you will get Lion DataNode\n",
+ "text_nodes = li.load(\n",
+ " 'SimpleDirectoryReader', reader_type='llama_index', reader_args=['papers/'], \n",
+ " to_datanode=False\n",
+ ")\n",
+ "\n",
+ "chunks = li.chunk(\n",
+ " documents=text_nodes, chunker_type = 'llama_index', chunker='SentenceSplitter', \n",
+ " chunker_kwargs={'chunk_size': 512, 'chunk_overlap':20}, to_datanode=False, \n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "index = get_index(chunks)\n",
+ "query_engine = index.as_query_engine(include_text=False, response_mode=\"tree_summarize\")"
+ ]
+ },
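+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Optionally, the index can be persisted so later runs skip re-chunking and re-embedding. This is a minimal sketch using llama_index's storage API; the `persist_dir` path is an arbitrary assumption."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save the vector index to disk (hypothetical path); reload later with\n",
+ "# llama_index's load_index_from_storage instead of rebuilding\n",
+ "# index.storage_context.persist(persist_dir=\"./storage\")"
+ ]
+ },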
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Write a tool description according to OpenAI schema"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tool_schema = {\n",
+ " \"type\": \"function\",\n",
+ " \"function\": {\n",
+ " \"name\": \"query_arxiv_papers\",\n",
+ " \"description\": \"\"\"\n",
+ " Perform a query to a QA bot with access to an \n",
+ " index built with papers from arxiv\n",
+ " \"\"\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"str_or_query_bundle\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"a question to ask the QA bot\",\n",
+ " }\n",
+ " },\n",
+ " \"required\": [\"str_or_query_bundle\"],\n",
+ " },\n",
+ " }\n",
+ " }\n",
+ "\n",
+ "\n",
+ "# we will need to register both the function description \n",
+ "# and actual implementation\n",
+ "tool = li.Tool(func=query_engine.query, parser=lambda x: str(x.response), schema_=tool_schema)"
+ ]
+ },
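+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick sanity check (the question below is just a hypothetical example), we can call the query engine directly and parse the result the same way the registered tool's parser does:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ask the index a test question; `query` returns a Response object whose\n",
+ "# `.response` attribute holds the plain-text answer\n",
+ "response = query_engine.query(\"What roles can knowledge graphs play for LLMs?\")\n",
+ "print(str(response.response))"
+ ]
+ },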
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Research: PROMPTS"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### FORMATS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# a rigidly set up prompt can help make outcome more deterministic\n",
+ "# though any string will work as well. \n",
+ "system = {\n",
+ " \"persona\": \"a helpful world-class researcher\",\n",
+ " \"requirements\": \"\"\"\n",
+ " think step by step before returning a clear, precise \n",
+ " worded answer with a humble yet confident tone\n",
+ " \"\"\",\n",
+ " \"responsibilities\": f\"\"\"\n",
+ " you are asked to help with researching on the topic \n",
+ " of Large Language Model\n",
+ " \"\"\",\n",
+ " \"tools\": \"provided with a QA bot for grounding responses\"\n",
+ "}\n",
+ "\n",
+ "# similarly, we can pass in any string or dictionary to instruction\n",
+ "# here we are modifying model behavior by telling mdel how to output \n",
+ "deliver_format1 = {\"return required\": \"yes\", \"return format\": \"paragraph\"}\n",
+ "\n",
+ "deliver_format2 = {\"return required\": \"yes\", \n",
+ " \"return format\": { \n",
+ " \"json_mode\": {\n",
+ " 'paper': \"paper_name\",\n",
+ " \"summary\": \"...\", \n",
+ " \"research question\": \"...\", \n",
+ " \"talking points\": {\n",
+ " \"point 1\": \"...\",\n",
+ " \"point 2\": \"...\",\n",
+ " \"point 3\": \"...\"\n",
+ " }}}}\n",
+ " \n",
+ "function_call = {\n",
+ " \"notice\":\"\"\"\n",
+ " At each task step, identified by step number, you must use the tool \n",
+ " at least twice. Notice you are provided with a QA bot as your tool, \n",
+ " the bot has access to the 2 papers via a queriable index \n",
+ " that takes natural language query and return a natural language \n",
+ " answer. You can decide whether to invoke the function call, you will \n",
+ " need to ask the bot when there are things need clarification or \n",
+ " further information. you provide the query by asking a question, \n",
+ " please use the tool as extensively as you can.\n",
+ " \"\"\"\n",
+ " }\n",
+ "\n",
+ "# here we create a two step process imitating the steps human would take to \n",
+ "# perform the research task\n",
+ "instruct1 = {\n",
+ " \"task step\": \"1\", \n",
+ " \"task name\": \"read paper abstracts\", \n",
+ " \"task objective\": \"get initial understanding of the papers of interest\", \n",
+ " \"task description\": \"\"\"\n",
+ " provided with abstracts of paper, provide a brief summary \n",
+ " highlighting the paper core points, the purpose is to extract \n",
+ " as much information as possible\n",
+ " \"\"\",\n",
+ " \"deliverable\": deliver_format1\n",
+ "}\n",
+ "\n",
+ "\n",
+ "instruct2 = {\n",
+ " \"task step\": \"2\",\n",
+ " \"task name\": \"propose research questions and talking points\", \n",
+ " \"task objective\": \"initial brainstorming\", \n",
+ " \"task description\": \"\"\"\n",
+ " from the improved understanding of the paper, please propose \n",
+ " an interesting, unique and practical research question, \n",
+ " support your reasoning. Kept on asking questions if things are \n",
+ " not clear. \n",
+ " \"\"\",\n",
+ " \"deliverable\": deliver_format2,\n",
+ " \"function calling\": function_call\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4. Research: Setup Workflow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "abstracts = \"\"\"\n",
+ "Abstract—Large language models (LLMs), such as ChatGPT and GPT4, are making new waves in the field of natural language processing and artificial intelligence, due to their emergent ability and generalizability. However, LLMs are black-box models, which often fall short of capturing and accessing factual knowledge. In contrast, Knowledge Graphs (KGs), Wikipedia and Huapu for example, are structured knowledge models that explicitly store rich factual knowledge. KGs can enhance LLMs by providing external knowledge for inference and interpretability. Meanwhile, KGs are difficult to construct and evolving by nature, which challenges the existing methods in KGs to generate new facts and represent unseen knowledge. Therefore, it is complementary to unify LLMs and KGs together and simultaneously leverage their advantages. In this article, we present a forward-looking roadmap for the unification of LLMs and KGs. Our roadmap consists of three general frameworks, namely, 1) KG-enhanced LLMs, which incorporate KGs during the pre-training and inference phases of LLMs, or for the purpose of enhancing understanding of the knowledge learned by LLMs; 2) LLM-augmented KGs, that leverage LLMs for different KG tasks such as embedding, completion, construction, graph-to-text generation, and question answering; and 3) Synergized LLMs + KGs, in which LLMs and KGs play equal roles and work in a mutually beneficial way to enhance both LLMs and KGs for bidirectional reasoning driven by both data and knowledge. We review and summarize existing efforts within these three frameworks in our roadmap and pinpoint their future research directions.\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "async def read_propose(context, num=5):\n",
+ " researcher = li.Session(system, dir=dir)\n",
+ " researcher.register_tools(tool)\n",
+ " \n",
+ " await researcher.initiate(instruct1, context=context, temperature=0.7)\n",
+ " await researcher.auto_followup(instruct2, tools=True, num=num)\n",
+ " \n",
+ " # researcher.messages_to_csv()\n",
+ " # researcher.log_to_csv()\n",
+ " return researcher"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 5. Research: Run the workflow"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "researcher = li.to_list(\n",
+ " await li.alcall(abstracts, read_propose), flatten=True\n",
+ ")[0]"
+ ]
+ },
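+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`li.alcall` maps the async `read_propose` over its input and gathers the results into a list, which is why we flatten and take the first element above. The same pattern fans out over several abstracts; the sketch below assumes a hypothetical `more_abstracts` list:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# a sketch: with a list of abstracts, each one gets its own researcher session\n",
+ "# researchers = li.to_list(\n",
+ "#     await li.alcall(more_abstracts, read_propose), flatten=True\n",
+ "# )"
+ ]
+ },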
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " node_id | \n",
+ " role | \n",
+ " name | \n",
+ " timestamp | \n",
+ " content | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " e3d8202ddcd80950b619664e6030566a | \n",
+ " system | \n",
+ " system | \n",
+ " 2024-01-18 13:31:27.516618 | \n",
+ " {\"system_info\": {\"persona\": \"a helpful world-c... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " d6f63a57c38f0382a40c21cb08d292b2 | \n",
+ " user | \n",
+ " user | \n",
+ " 2024-01-18 13:31:27.517302 | \n",
+ " {\"instruction\": {\"task step\": \"1\", \"task name\"... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4e1e291bd9ab5f3181a0b8d7519eb7ed | \n",
+ " assistant | \n",
+ " assistant | \n",
+ " 2024-01-18 13:31:44.000922 | \n",
+ " {\"response\": \"Certainly, the abstract provided... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 93f6af19b2432c802d12a8f68285b903 | \n",
+ " user | \n",
+ " user | \n",
+ " 2024-01-18 13:31:44.002273 | \n",
+ " {\"instruction\": {\"task step\": \"2\", \"task name\"... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0c395f94f6e27b2ae69d8814bcf16c30 | \n",
+ " assistant | \n",
+ " action_request | \n",
+ " 2024-01-18 13:31:52.241974 | \n",
+ " {\"action_list\": [{\"action\": \"action_query_arxi... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " node_id role name \\\n",
+ "0 e3d8202ddcd80950b619664e6030566a system system \n",
+ "1 d6f63a57c38f0382a40c21cb08d292b2 user user \n",
+ "2 4e1e291bd9ab5f3181a0b8d7519eb7ed assistant assistant \n",
+ "3 93f6af19b2432c802d12a8f68285b903 user user \n",
+ "4 0c395f94f6e27b2ae69d8814bcf16c30 assistant action_request \n",
+ "\n",
+ " timestamp \\\n",
+ "0 2024-01-18 13:31:27.516618 \n",
+ "1 2024-01-18 13:31:27.517302 \n",
+ "2 2024-01-18 13:31:44.000922 \n",
+ "3 2024-01-18 13:31:44.002273 \n",
+ "4 2024-01-18 13:31:52.241974 \n",
+ "\n",
+ " content \n",
+ "0 {\"system_info\": {\"persona\": \"a helpful world-c... \n",
+ "1 {\"instruction\": {\"task step\": \"1\", \"task name\"... \n",
+ "2 {\"response\": \"Certainly, the abstract provided... \n",
+ "3 {\"instruction\": {\"task step\": \"2\", \"task name\"... \n",
+ "4 {\"action_list\": [{\"action\": \"action_query_arxi... "
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = researcher.conversation.messages\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['system', 'user', 'assistant', 'action_request', 'action_response'],\n",
+ " dtype=object)"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.name.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " \"action\": \"action_query_arxiv_papers\",\n",
+ " \"arguments\": \"{\\\"str_or_query_bundle\\\":\\\"What are the current challenges in integrating Knowledge Graphs with Large Language Models?\\\"}\"\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# let us check the questions assistant asked\n",
+ "df_requests = df[df.name == \"action_request\"]\n",
+ "\n",
+ "for content in df_requests.content:\n",
+ " for i in li.as_dict(content)['action_list']:\n",
+ " print(li.to_readable_dict(i))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from IPython.display import Markdown"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " node_id | \n",
+ " role | \n",
+ " name | \n",
+ " timestamp | \n",
+ " content | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 5 | \n",
+ " c0699e2742950b636ad61a48650b8ea4 | \n",
+ " assistant | \n",
+ " action_response | \n",
+ " 2024-01-18 13:32:25.292728 | \n",
+ " {\"action_response\": {\"function\": \"query_arxiv_... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " node_id role name \\\n",
+ "5 c0699e2742950b636ad61a48650b8ea4 assistant action_response \n",
+ "\n",
+ " timestamp \\\n",
+ "5 2024-01-18 13:32:25.292728 \n",
+ "\n",
+ " content \n",
+ "5 {\"action_response\": {\"function\": \"query_arxiv_... "
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# let us check the answers from query engine\n",
+ "df_response= df[df.name == \"action_response\"]\n",
+ "df_requests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/markdown": [
+ "Current challenges in integrating Knowledge Graphs with Large Language Models (LLMs) include:\n",
+ "\n",
+ "1. **Scalability**: As LLMs and knowledge graphs grow in size, it becomes increasingly difficult to efficiently integrate and update the vast amounts of information contained within them.\n",
+ "\n",
+ "2. **Alignment**: Ensuring that the knowledge graph's structured information aligns with the LLM's learned representations can be challenging, as LLMs may develop their own idiosyncratic understanding of concepts.\n",
+ "\n",
+ "3. **Dynamic Knowledge**: Knowledge graphs need to be constantly updated to reflect new information, but integrating these updates into an LLM that has been trained on a static snapshot of data can be problematic.\n",
+ "\n",
+ "4. **Reasoning and Inference**: While LLMs are adept at generating human-like text, they may struggle with logical reasoning or inference tasks that knowledge graphs can support. Bridging the gap between neural text generation and structured logical reasoning is a non-trivial challenge.\n",
+ "\n",
+ "5. **Contextual Understanding**: LLMs may not always effectively leverage the context provided by a knowledge graph, leading to responses that are factually incorrect or lack relevance.\n",
+ "\n",
+ "6. **Complex Queries**: Handling complex queries that require multi-hop reasoning over a knowledge graph is difficult, as it requires the LLM to maintain coherence over long text generations and to accurately access and apply relevant information from the graph.\n",
+ "\n",
+ "7. **Interpretability**: Ensuring that the integration of knowledge graphs into LLMs is interpretable and transparent is important for trust and reliability, but this remains a difficult task given the often opaque nature of neural network decision-making processes.\n",
+ "\n",
+ "8. **Data Quality and Bias**: The quality of the data in the knowledge graph can affect the performance of the LLM, and biases present in the data can propagate through the model, leading to biased outputs.\n",
+ "\n",
+ "Addressing these challenges requires ongoing research and development in the fields of machine learning, natural language processing, and knowledge representation."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Markdown(li.as_dict(content)['action_response']['output'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now let us read the assistant's responses"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_assistant = df[df.name == \"assistant\"]\n",
+ "len(df_assistant)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/markdown": [
+ "Certainly, the abstract provided outlines the interplay between Large Language Models (LLMs) like ChatGPT and GPT-4, and Knowledge Graphs (KGs) such as Wikipedia and Huapu. The core point of the paper is that while LLMs are powerful in processing natural language, they tend to lack in capturing and accessing concrete factual knowledge, which is where KGs excel. The paper's purpose is to explore ways to unify LLMs and KGs to harness their respective strengths. \n",
+ "\n",
+ "The authors propose a roadmap with three general frameworks for this unification: \n",
+ "\n",
+ "1. KG-enhanced LLMs, which integrate KGs into various stages of LLM development and usage, either to assist with pre-training and inference or to improve the LLMs' grasp of the knowledge they've learned.\n",
+ "\n",
+ "2. LLM-augmented KGs, in which LLMs are utilized to perform tasks related to KGs, including embedding, completion, construction, and more complex functions like graph-to-text generation and question answering.\n",
+ "\n",
+ "3. Synergized LLMs + KGs, a model where LLMs and KGs collaborate closely, providing mutual benefits and enabling bidirectional reasoning that incorporates both data and knowledge.\n",
+ "\n",
+ "The abstract concludes by reviewing existing efforts in these areas and suggesting future research directions, indicating that this is a forward-looking and potentially transformative approach to advancing the field of AI and natural language understanding."
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# the first response corresponds to the first user instruction, which is to read through the abstract\n",
+ "\n",
+ "response1 = li.as_dict(df_assistant.content.iloc[0])['response']\n",
+ "Markdown(response1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/markdown": [
+ "Based on the improved understanding of the challenges in integrating Knowledge Graphs with Large Language Models, a research question that arises might be:\n",
+ "\n",
+ "**Research Question:** How can we develop an adaptive integration framework for LLMs and KGs that maintains the up-to-dateness of the knowledge graph while ensuring the scalability and alignment of the LLM?\n",
+ "\n",
+ "**Supporting Reasoning:** This question is practical as it addresses the dynamic nature of knowledge and the need for LLMs to continuously learn from updated information. It is unique in its focus on creating an adaptive framework that can handle the scalability issues that come with the ever-growing size of LLMs and KGs, as well as ensuring that the LLM's learned representations align with the structured information of the KG.\n",
+ "\n",
+ "**Talking Points:**\n",
+ "- **Point 1:** Scalability and efficiency are major concerns as both LLMs and KGs grow; an adaptive framework could include mechanisms for incremental learning or modular updates that prevent the need for retraining from scratch.\n",
+ "- **Point 2:** Alignment between the evolving representations of knowledge in LLMs and the structured format of KGs requires continuous synchronization methods, possibly utilizing advanced alignment algorithms or transfer learning techniques.\n",
+ "- **Point 3:** Keeping the knowledge graph up-to-date in a way that the LLM can efficiently utilize is crucial; this might involve real-time updating mechanisms or periodic 'knowledge refreshes' that the LLM can integrate without compromising performance. \n",
+ "\n",
+ "To further clarify the potential of this research direction, I will invoke the function call once more to ask a follow-up question.\n",
+ "\n",
+ "{\"action_list\": [{\"action\": \"action_query_arxiv_papers\", \"arguments\": \"{\\\"str_or_query_bundle\\\":\\\"What are the latest approaches to ensuring the scalability and alignment of LLMs in the context of knowledge graph integration?\\\"}\"}]}"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# the second is the second instruciton, which is the final output in this case\n",
+ "\n",
+ "response2 = li.as_dict(df_assistant.content.iloc[1])['response']\n",
+ "Markdown(response2)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "lion_dev",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/auto_Research_with_LlamaIndex.ipynb b/notebooks/auto_Research_with_LlamaIndex.ipynb
deleted file mode 100644
index 44cc28b25..000000000
--- a/notebooks/auto_Research_with_LlamaIndex.ipynb
+++ /dev/null
@@ -1,367 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Paper Savior with LionAGI and LlamaIndex Vector Index\n",
- "\n",
- "-- how to do auto explorative research with LionAGI plus RAG using llamaindex Vector Index & embedding \n",
- "\n",
- "- [LionAGI](https://github.com/lion-agi/lionagi)\n",
- "- [LlamaIndex](https://www.llamaindex.ai)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "# %pip install lionagi pypdf llama_index"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 1. Build a Vector Index with llama_index"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "from llama_index import SimpleDirectoryReader\n",
- "from llama_index.node_parser import SentenceSplitter\n",
- "from llama_index import ServiceContext, VectorStoreIndex\n",
- "from llama_index.llms import OpenAI\n",
- "\n",
- "\n",
- "loader = SimpleDirectoryReader(input_dir='papers/', required_exts='.pdf')\n",
- "node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)\n",
- "documents = loader.load_data(show_progress=False)\n",
- "\n",
- "nodes = node_parser.get_nodes_from_documents(documents, show_progress=False)\n",
- "\n",
- "# set up index object\n",
- "llm = OpenAI(temperature=0.1, model=\"gpt-4-1106-preview\")\n",
- "service_context = ServiceContext.from_defaults(llm=llm)\n",
- "index1 = VectorStoreIndex(nodes, include_embeddings=True, \n",
- " service_context=service_context)\n",
- "\n",
- "# set up query engine\n",
- "query_engine = index1.as_query_engine(\n",
- " include_text=False, response_mode=\"tree_summarize\"\n",
- " )"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 2. Write a tool description according to OpenAI schema"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "import lionagi as li"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "tools = [\n",
- " {\n",
- " \"type\": \"function\",\n",
- " \"function\": {\n",
- " \"name\": \"query_arxiv_papers\",\n",
- " \"description\": \"\"\"\n",
- " Perform a query to a QA bot with access to an \n",
- " index built with papers from arxiv\n",
- " \"\"\",\n",
- " \"parameters\": {\n",
- " \"type\": \"object\",\n",
- " \"properties\": {\n",
- " \"str_or_query_bundle\": {\n",
- " \"type\": \"string\",\n",
- " \"description\": \"a question to ask the QA bot\",\n",
- " }\n",
- " },\n",
- " \"required\": [\"str_or_query_bundle\"],\n",
- " },\n",
- " }\n",
- " }\n",
- "]\n",
- "\n",
- "# we will need to register both the function description \n",
- "# and actual implementation\n",
- "tool = li.Tool(func=query_engine.query, parser=lambda x: x.response, schema_=tools[0])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 3. Research: PROMPTS"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### FORMATS"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "# a rigidly set up prompt can help make outcome more deterministic\n",
- "# though any string will work as well. \n",
- "system = {\n",
- " \"persona\": \"a helpful world-class researcher\",\n",
- " \"requirements\": \"\"\"\n",
- " think step by step before returning a clear, precise \n",
- " worded answer with a humble yet confident tone\n",
- " \"\"\",\n",
- " \"responsibilities\": f\"\"\"\n",
- " you are asked to help with researching on the topic \n",
- " of {query}\n",
- " \"\"\",\n",
- " \"tools\": \"provided with a QA bot for grounding responses\"\n",
- "}\n",
- "\n",
- "# similarly, we can pass in any string or dictionary to instruction\n",
- "# here we are modifying model behavior by telling mdel how to output \n",
- "deliver_format1 = {\"return required\": \"yes\", \"return format\": \"paragraph\"}\n",
- "\n",
- "deliver_format2 = {\"return required\": \"yes\", \n",
- " \"return format\": { \n",
- " \"json_mode\": {\n",
- " 'paper': \"paper_name\",\n",
- " \"summary\": \"...\", \n",
- " \"research question\": \"...\", \n",
- " \"talking points\": {\n",
- " \"point 1\": \"...\",\n",
- " \"point 2\": \"...\",\n",
- " \"point 3\": \"...\"\n",
- " }}}}\n",
- " \n",
- "function_call = {\n",
- " \"notice\":f\"\"\"\n",
- " At each task step, identified by step number, you must use the tool \n",
- " at least twice. Notice you are provided with a QA bot as your tool, \n",
- " the bot has access to the {num_papers} papers via a queriable index \n",
- " that takes natural language query and return a natural language \n",
- " answer. You can decide whether to invoke the function call, you will \n",
- " need to ask the bot when there are things need clarification or \n",
- " further information. you provide the query by asking a question, \n",
- " please use the tool as extensively as you can.\n",
- " \"\"\"\n",
- " }\n",
- "\n",
- "# here we create a two step process imitating the steps human would take to \n",
- "# perform the research task\n",
- "instruct1 = {\n",
- " \"task step\": \"1\", \n",
- " \"task name\": \"read paper abstracts\", \n",
- " \"task objective\": \"get initial understanding of the papers of interest\", \n",
- " \"task description\": \"\"\"\n",
- " provided with abstracts of paper, provide a brief summary \n",
- " highlighting the paper core points, the purpose is to extract \n",
- " as much information as possible\n",
- " \"\"\",\n",
- " \"deliverable\": deliver_format1\n",
- "}\n",
- "\n",
- "\n",
- "instruct2 = {\n",
- " \"task step\": \"2\",\n",
- " \"task name\": \"propose research questions and talking points\", \n",
- " \"task objective\": \"initial brainstorming\", \n",
- " \"task description\": \"\"\"\n",
- " from the improved understanding of the paper, please propose \n",
- " an interesting, unique and practical research question, \n",
- " support your reasoning. Kept on asking questions if things are \n",
- " not clear. \n",
- " \"\"\",\n",
- " \"deliverable\": deliver_format2,\n",
- " \"function calling\": function_call\n",
- "}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 4. Research: Setup Workflow"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "abstracts = \"\"\"\n",
- "Abstract—Large language models (LLMs), such as ChatGPT and GPT4, are making new waves in the field of natural language processing and artificial intelligence, due to their emergent ability and generalizability. However, LLMs are black-box models, which often fall short of capturing and accessing factual knowledge. In contrast, Knowledge Graphs (KGs), Wikipedia and Huapu for example, are structured knowledge models that explicitly store rich factual knowledge. KGs can enhance LLMs by providing external knowledge for inference and interpretability. Meanwhile, KGs are difficult to construct and evolving by nature, which challenges the existing methods in KGs to generate new facts and represent unseen knowledge. Therefore, it is complementary to unify LLMs and KGs together and simultaneously leverage their advantages. In this article, we present a forward-looking roadmap for the unification of LLMs and KGs. Our roadmap consists of three general frameworks, namely, 1) KG-enhanced LLMs, which incorporate KGs during the pre-training and inference phases of LLMs, or for the purpose of enhancing understanding of the knowledge learned by LLMs; 2) LLM-augmented KGs, that leverage LLMs for different KG tasks such as embedding, completion, construction, graph-to-text generation, and question answering; and 3) Synergized LLMs + KGs, in which LLMs and KGs play equal roles and work in a mutually beneficial way to enhance both LLMs and KGs for bidirectional reasoning driven by both data and knowledge. We review and summarize existing efforts within these three frameworks in our roadmap and pinpoint their future research directions.\n",
- "\"\"\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "async def read_propose(context, num=5):\n",
- " researcher = li.Session(system, dir=dir)\n",
- " researcher.register_tools(tool)\n",
- " \n",
- " await researcher.initiate(instruct1, context=context, temperature=0.7)\n",
- " await researcher.auto_followup(instruct2, tools=tools, num=num)\n",
- " \n",
- " # researcher.messages_to_csv()\n",
- " # researcher.log_to_csv()\n",
- " return researcher"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 5. Research: Run the workflow"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2024-01-14 13:50:30,363 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
- "2024-01-14 13:50:45,724 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
- ]
- }
- ],
- "source": [
- "researcher = await li.alcall(abstracts, read_propose)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "researcher = researcher[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The provided abstract discusses the integration of Large Language Models (LLMs) like ChatGPT and GPT4 with Knowledge Graphs (KGs), such as Wikipedia and Huapu, to enhance the capabilities of both systems. LLMs are adept at natural language processing but are often criticized for being \"black-box\" models with limitations in accessing factual knowledge. KGs, on the other hand, store factual knowledge explicitly but are complex to construct and maintain. The paper proposes a roadmap for combining LLMs and KGs to exploit their respective strengths and mitigate their weaknesses. This unification is structured into three frameworks: KG-enhanced LLMs, which integrate KGs into the training and inference stages of LLMs; LLM-augmented KGs, which use LLMs to assist various KG tasks; and Synergized LLMs + KGs, where both systems work together to improve bidirectional reasoning. The abstract highlights existing efforts within these frameworks and suggests future research directions.\n",
- "\n",
- "{\"function_list\": [{\"function\": \"func_query_arxiv_papers\", \"arguments\": \"{\\\"str_or_query_bundle\\\":\\\"What are the potential challenges and limitations of unifying LLMs with KGs?\\\"}\"}]}\n",
- "\n",
- "{\"function\": \"query_arxiv_papers\", \"arguments\": {\"str_or_query_bundle\": \"What are the potential challenges and limitations of unifying LLMs with KGs?\"}, \"output\": \"Unifying large language models (LLMs) with knowledge graphs (KGs) can present several challenges and limitations. One potential challenge is the integration of reasoning and acting capabilities within LLMs, as highlighted by the development of methods like ReAct. While ReAct aims to synergize these aspects, complex tasks with large action spaces may require more demonstrations to learn effectively, which can exceed the input length limits of in-context learning. This indicates a limitation in the current capacity of LLMs to handle extensive information without additional fine-tuning or expansion of input capabilities.\\n\\nAnother challenge is the preservation of essential information when attempting to compress prompts to accommodate the limitations of LLMs, especially when accessed via APIs. Approaches like LLMLingua address the need to compress prompts without losing critical information, but this also introduces the complexity of ensuring that the compressed content maintains its relevance and coherence. The interdependence between compressed contents and the correspondence between the LLM and the model used for prompt compression must be carefully managed to avoid degradation in performance.\\n\\nFurthermore, the selective-context method, which involves dropping less informative content for prompt compression, may overlook the nuanced interplay between different pieces of information. This could lead to a loss of context or meaning, which is particularly problematic when integrating with KGs that rely on the precise interlinking of information.\\n\\nIn summary, the challenges and limitations of unifying LLMs with KGs include managing the complexity of reasoning and acting within large action spaces, preserving essential information during prompt compression, and ensuring the relevance and coherence of compressed content. Additionally, there is a need to consider the interdependence of information and the compatibility between different models used in the compression and application of LLMs.\"}\n",
- "\n",
- "Based on the understanding gained from the abstract and the additional insights into the challenges of unifying Large Language Models (LLMs) with Knowledge Graphs (KGs), I propose the following research question and talking points:\n",
- "\n",
- "**Research Question:**\n",
- "How can we develop a dynamic compression algorithm that selectively integrates KG information into LLM prompts without losing critical information and maintaining coherence, especially when dealing with large action spaces and API limitations?\n",
- "\n",
- "**Talking Points:**\n",
- "1. **Point 1:** Addressing the challenge of integrating reasoning and action capabilities within LLMs, such as the ReAct method, and exploring how to expand the input length limits of in-context learning to accommodate complex tasks with large action spaces.\n",
- "2. **Point 2:** Investigating techniques for prompt compression, like those used in LLMLingua, to maintain essential information and relevance during the compression process, ensuring that the LLM can still effectively utilize the compressed knowledge.\n",
- "3. **Point 3:** Considering the selective-context method's limitations in information retention, and developing strategies to prevent loss of context or meaning, which is crucial for the precise interlinking of information in KGs.\n",
- "\n",
- "To support further exploration and clarification on these points, I will use the QA bot to inquire about specific methods and approaches that exist for prompt compression and the interdependence of information within LLMs and KGs. This will help refine the research question and validate the talking points.\n",
- "\n"
- ]
- }
- ],
- "source": [
- "for msg in researcher.conversation.messages:\n",
- " if msg.role == \"assistant\":\n",
- " print(f\"{msg.msg_content}\\n\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/markdown": [
- "Based on the understanding gained from the abstract and the additional insights into the challenges of unifying Large Language Models (LLMs) with Knowledge Graphs (KGs), I propose the following research question and talking points:\n",
- "\n",
- "**Research Question:**\n",
- "How can we develop a dynamic compression algorithm that selectively integrates KG information into LLM prompts without losing critical information and maintaining coherence, especially when dealing with large action spaces and API limitations?\n",
- "\n",
- "**Talking Points:**\n",
- "1. **Point 1:** Addressing the challenge of integrating reasoning and action capabilities within LLMs, such as the ReAct method, and exploring how to expand the input length limits of in-context learning to accommodate complex tasks with large action spaces.\n",
- "2. **Point 2:** Investigating techniques for prompt compression, like those used in LLMLingua, to maintain essential information and relevance during the compression process, ensuring that the LLM can still effectively utilize the compressed knowledge.\n",
- "3. **Point 3:** Considering the selective-context method's limitations in information retention, and developing strategies to prevent loss of context or meaning, which is crucial for the precise interlinking of information in KGs.\n",
- "\n",
- "To support further exploration and clarification on these points, I will use the QA bot to inquire about specific methods and approaches that exist for prompt compression and the interdependence of information within LLMs and KGs. This will help refine the research question and validate the talking points."
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from IPython.display import Markdown\n",
- "Markdown(researcher.conversation.messages[-1].msg_content)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "lion_dev",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.7"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}