Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Citation fix and local sandbox #17

Merged
merged 5 commits into from
Jul 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 20 additions & 23 deletions apps/slackbot/bolt_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,11 @@
from flask import Flask, request
load_dotenv()
from langchain.chat_models import ChatOpenAI
from langchain import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from os import environ
from vectorstores import ConversationStore
from prompt import SlackBotPrompt
from vectorstores import ConversationStore, LocalChromaStore
from slack_bolt import App
from slack_bolt.adapter.flask import SlackRequestHandler
from langchain.agents import Tool
from tools import SearchTool, get_tools
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from tools import get_tools
from task_agent import TaskAgent


Expand Down Expand Up @@ -135,24 +125,31 @@ def slack_events():
def hello():
    """Liveness probe: confirm the slackbot web app is up."""
    return "OK"

# Startup fallback: when no Pinecone key is configured, build an in-process
# Chroma retriever over the local `files` folder so the bot still has memory.
if 'PINECONE_API_KEY' not in os.environ:
    print("Warning: Pinecone API key not specified. Using local Chroma database.")
    local_memory = LocalChromaStore.from_folder('files', OPENAI_KEY).as_retriever()

def get_response(question, previous_messages):
    """Answer a Slack question with the task agent.

    question: raw Slack message text (bot @-mentions are rewritten to the
        agent's name before being passed to the agent).
    previous_messages: prior conversation messages forwarded to the agent.

    Returns the agent's final textual reply.
    """
    llm = ChatOpenAI(
        openai_api_key=OPENAI_KEY, request_timeout=120
    )

    if os.environ.get("PINECONE_API_KEY", False):
        # If the Pinecone API key is specified, use the Pinecone database.
        memory = ConversationStore.get_vector_retrieval(
            'ReadTheDocs', OPENAI_KEY,
            index_name=os.getenv("PINECONE_INDEX"),
            search_type='similarity_score_threshold',
            search_kwargs={'score_threshold': 0.0},
        )
    else:
        # Otherwise fall back to the local Chroma retriever built at startup.
        memory = local_memory

    tools = get_tools(memory)

    ai_name = 'Sherpa'
    ai_id = bot['user_id']
    # Replace the bot's Slack user-id mention with its friendly name so the
    # LLM sees "@Sherpa" rather than an opaque id.
    question = question.replace(f'@{ai_id}', f'@{ai_name}')

    task_agent = TaskAgent.from_llm_and_tools(
        ai_name="Sherpa",
        ai_role="assistant",
        ai_id=bot['user_id'],
        memory=memory,
        tools=tools,
        previous_messages=previous_messages,
        llm=llm,
    )
    return task_agent.run(question)

Expand Down
2 changes: 1 addition & 1 deletion apps/slackbot/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def format_messages(self, **kwargs: Any) -> List[BaseMessage]:
messages: List[BaseMessage] = [base_prompt, time_prompt]
messages += historical_messages
messages.append(input_message)
print("all_prompt:", previous_messages)
print("all_prompt:", messages)
return messages

def process_chat_history(self, messages: List[dict]) -> List[BaseMessage]:
Expand Down
10 changes: 5 additions & 5 deletions apps/slackbot/task_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def from_llm_and_tools(

def run(self, task: str) -> str:
user_input = (
"Determine which next command to use, "
"Determine which next command to use. "
"and respond using the JSON format specified above without any extra text."
"\n JSON Response: \n"
)
Expand All @@ -105,9 +105,9 @@ def run(self, task: str) -> str:

if loop_count >= self.max_iterations:
user_input = (
"Use information gathered above to finish the task."
"if the tool used is Search Tool, create inline citation at the of the sentence that use the result of the Search Tool"
"Give a number of citation and put the link from result of a search tool at each inline citation"
f"Use the above information to respond to the user's message:\n{task}\n\n"
f"If you use any resource, then create inline citation by adding the source link of the reference document at the of the sentence."
f"Only use the link given in the reference document. DO NOT create link by yourself. DO NOT include citation if the resource is not necessary. "
"only write text but not the JSON format specified above. \nResult:"
)

Expand Down Expand Up @@ -135,7 +135,7 @@ def run(self, task: str) -> str:
return assistant_reply
return result["command"]["args"]["response"]


# Get command name and arguments
action = self.output_parser.parse(assistant_reply)
print("action:", action)
Expand Down
65 changes: 20 additions & 45 deletions apps/slackbot/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
import requests
from bs4 import BeautifulSoup
from langchain.utilities import GoogleSerperAPIWrapper
from langchain.docstore.document import Document
from langchain.prompts import Prompt
from langchain.vectorstores.base import VectorStoreRetriever
from typing_extensions import Literal
import os


def get_tools(memory):
Expand All @@ -16,36 +16,24 @@ def get_tools(memory):
"in the instruction.\nTask: {input}\nResult: "
)
prompt = Prompt.from_template(prompt)
# llm_chain = LLMChain(llm=llm, prompt=prompt)
search_tool = SearchTool(api_wrapper=GoogleSerperAPIWrapper())
# llm_tool = LLMTool(llm_chain=llm_chain)
tools = []

# user_input_tool = UserInputTool()
context_tool = ContextTool(memory=memory)
tools.append(ContextTool(memory=memory))

return [search_tool, context_tool]
if os.environ.get("SERPER_API_KEY", False):
search_tool = SearchTool(api_wrapper=GoogleSerperAPIWrapper())
tools.append(search_tool)
else:
print("No SERPER_API_KEY found in environment variables, skipping SearchTool")


class ScrapeTool(BaseTool):
name = "Scrape"
description = "A tool for scraping a website for information."
chunk_size = 200

def _run(self, url: str) -> str:
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
data = soup.get_text(strip=True)

return data

def _arun(self, *args: Any, **kwargs: Any):
raise NotImplementedError("ScrapeTool does not support async run")
return tools


class SearchTool(BaseTool):
name = "Search"
description = (
"Access the internet to search for the information, the input is a search query"
"Access the internet to search for the information, only use this tool when "
"you cannot find the informaiton using internal search."
)
api_wrapper: GoogleSerperAPIWrapper

Expand All @@ -54,7 +42,7 @@ def _run(self, query: str) -> str:
search_results = google_serper._google_serper_api_results(query)

# case 1: answerBox in the result dictionary
if search_results.get("answerBox"):
if search_results.get("answerBox", False):
answer_box = search_results.get("answerBox", {})
if answer_box.get("answer"):
answer = answer_box.get("answer")
Expand All @@ -69,7 +57,7 @@ def _run(self, query: str) -> str:

# case 2: knowledgeGraph in the result dictionary
snippets = []
if search_results.get("knowledgeGraph"):
if search_results.get("knowledgeGraph", False):
kg = search_results.get("knowledgeGraph", {})
title = kg.get("title")
entity_type = kg.get("type")
Expand Down Expand Up @@ -106,32 +94,19 @@ def _run(self, query: str) -> str:
full_result = "\n".join(result)

# answer = " ".join(snippets)
answer = "Description: " + search_results["knowledgeGraph"]['title'] + search_results["knowledgeGraph"]['description'] + "\nLink: " + search_results["knowledgeGraph"]['descriptionLink']
return answer + full_result
if 'knowledgeGraph' in search_results:
answer = "Description: " + search_results["knowledgeGraph"]['title'] + search_results["knowledgeGraph"]['description'] + "\nLink: " + search_results["knowledgeGraph"]['descriptionLink']
full_result = answer + "\n" + full_result
return full_result

def _arun(self, query: str) -> str:
raise NotImplementedError("SearchTool does not support async run")


class LLMTool(BaseTool):
name = "LLM"
description = (
"Access the LLM to perform different tasks"
)
llm_chain: LLMChain

def _run(self, query: str) -> str:
return self.llm_chain.run(input=query)

def _arun(self, query: str) -> str:
raise NotImplementedError("LLMTool does not support async run")


class ContextTool(BaseTool):
name = "Context"
name = "Context Search"
description = (
"Access the read-only domain specific internal documents for the task."
"You use this tool if you need further clarification of the task."
"Access internal documents for various information."
)
memory: VectorStoreRetriever

Expand All @@ -140,7 +115,7 @@ def _run(self, query: str) -> str:
result = ""
for doc in docs:
result += "Document" + doc.page_content + "\nLink" + doc.metadata.get("source", "")

return result

def _arun(self, query: str) -> str:
Expand Down
19 changes: 19 additions & 0 deletions apps/slackbot/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredPDFLoader, UnstructuredMarkdownLoader


def load_files(files: List[str]) -> List[Document]:
    """Load document files into langchain Documents.

    Supported formats: PDF (UnstructuredPDFLoader) and Markdown
    (UnstructuredMarkdownLoader).

    Args:
        files: paths of the files to load.

    Returns:
        All loaded Documents, in the order the paths were given.

    Raises:
        NotImplementedError: if a path has an unsupported extension.
    """
    documents: List[Document] = []
    for path in files:
        print(f'Loading file {path}')
        if path.endswith(".pdf"):
            loader = UnstructuredPDFLoader(path)
        elif path.endswith(".md"):
            loader = UnstructuredMarkdownLoader(path)
        else:
            raise NotImplementedError(f"File type {path} not supported")
        documents.extend(loader.load())

    # NOTE: removed the debug `print(documents)` — it dumped the full text of
    # every loaded document to stdout on each call.
    return documents
68 changes: 52 additions & 16 deletions apps/slackbot/vectorstores.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
import pinecone
import os
import uuid
from typing import Any, Iterable, List, Optional, Type
from typing import Any, Iterable, List, Optional, Tuple, Type
from langchain.docstore.document import Document
import logging
from langchain.vectorstores.base import VectorStoreRetriever
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import Chroma
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from utils import load_files


PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
Expand Down Expand Up @@ -69,6 +70,31 @@ def similarity_search(
if res["score"] > threshold:
docs.append(Document(page_content=text, metadata=metadata))
return docs

def _similarity_search_with_relevance_scores(
    self,
    query: str,
    k: int = 4,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return the top-k documents for `query` paired with their scores.

    The query is embedded with `self.embeddings` and run against the
    Pinecone index `self.db` inside `self.namespace`. The raw Pinecone
    match score is returned unmodified alongside each Document.

    Args:
        query: natural-language search query.
        k: maximum number of matches to return.
        **kwargs: may contain `filter`, forwarded to Pinecone's query.

    Returns:
        List of (Document, score) tuples, in Pinecone's result order.
    """
    query_embedding = self.embeddings.embed_query(query)
    results = self.db.query(
        [query_embedding],
        top_k=k,
        include_metadata=True,
        namespace=self.namespace,
        filter=kwargs.get("filter", None),
    )

    docs_with_score = []
    for match in results["matches"]:
        metadata = match["metadata"]
        # The stored text lives under `self.text_key`; pop it out so the
        # remaining metadata rides along on the Document.
        text = metadata.pop(self.text_key)
        docs_with_score.append(
            (Document(page_content=text, metadata=metadata), match["score"])
        )
    return docs_with_score


@classmethod
def delete(cls, namespace, index_name):
Expand All @@ -78,26 +104,36 @@ def delete(cls, namespace, index_name):

@classmethod
def get_vector_retrieval(
    cls, namespace: str, openai_api_key: str, index_name: str,
    search_type='similarity', search_kwargs=None
) -> VectorStoreRetriever:
    """Build a VectorStoreRetriever over an existing Pinecone index.

    Args:
        namespace: Pinecone namespace to search in.
        openai_api_key: key used for query embeddings.
        index_name: name of the Pinecone index.
        search_type: retriever search strategy (e.g. 'similarity' or
            'similarity_score_threshold').
        search_kwargs: extra retriever search options. Defaults to {}
            (a fresh dict per call — a `={}` default would be shared
            and mutable across calls).
    """
    if search_kwargs is None:
        search_kwargs = {}
    vectorstore = cls.from_index(namespace, openai_api_key, index_name)
    retriever = VectorStoreRetriever(
        vectorstore=vectorstore,
        search_type=search_type,
        search_kwargs=search_kwargs,
    )
    return retriever

@classmethod
def from_texts(cls, texts: List[str], embedding: Embeddings, metadatas: list[dict]):
    """Deliberately unsupported: this store is built from an existing
    Pinecone index (see `from_index` / `get_vector_retrieval`), not raw texts.
    """
    raise NotImplementedError("ConversationStore does not support from_texts")



def get_local_db(pdf_folder_path, openai_api_key) -> VectorStoreRetriever:
loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
# loaders
documents = []
for loader in loaders:
documents.extend(loader.load())

index = VectorstoreIndexCreator(
embedding=OpenAIEmbeddings(openai_api_key=openai_api_key),
text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)
class LocalChromaStore(Chroma):
    """Chroma-backed vector store built from a local folder of documents.

    Used as the fallback memory when no Pinecone API key is configured.
    """

    @classmethod
    def from_folder(cls, file_path, openai_api_key, index_name='chroma'):
        """
        Create a Chroma DB from a folder of files (currently only supports
        PDF and Markdown files).

        file_path: path to the folder
        openai_api_key: OpenAI API key used for embeddings
        index_name: name of the index
        """
        # os.path.join is portable; the original concatenated with '/'.
        files = [os.path.join(file_path, name) for name in os.listdir(file_path)]
        documents = load_files(files)

        embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
        chroma = cls(index_name, embeddings)
        # Keep chunks small enough for the embedding model.
        # (Fixes the `test_splitter` typo in the original.)
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        chunks = text_splitter.split_documents(documents)
        chroma.add_documents(chunks)
        return chroma
return chroma