Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Citation fix and local sandbox #17

Merged
merged 5 commits into from
Jul 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 20 additions & 23 deletions apps/slackbot/bolt_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,11 @@
from flask import Flask, request
load_dotenv()
from langchain.chat_models import ChatOpenAI
from langchain import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from os import environ
from vectorstores import ConversationStore
from prompt import SlackBotPrompt
from vectorstores import ConversationStore, LocalChromaStore
from slack_bolt import App
from slack_bolt.adapter.flask import SlackRequestHandler
from langchain.agents import Tool
from tools import SearchTool, get_tools
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from tools import get_tools
from task_agent import TaskAgent


Expand Down Expand Up @@ -135,24 +125,31 @@ def slack_events():
def hello():
    """Liveness probe: confirm the slackbot web app is up."""
    return "OK"

# Startup fallback: when no Pinecone key is configured, build an in-process
# Chroma retriever over the local `files` folder so the bot still has memory.
if 'PINECONE_API_KEY' not in os.environ:
    print("Warning: Pinecone API key not specified. Using local Chroma database.")
    local_memory = LocalChromaStore.from_folder('files', OPENAI_KEY).as_retriever()

def get_response(question, previous_messages):
    """Answer a Slack question with the task agent.

    question: raw Slack message text (bot @-mentions are rewritten to the
        agent's name before being passed to the agent).
    previous_messages: prior conversation messages forwarded to the agent.

    Returns the agent's final textual reply.
    """
    llm = ChatOpenAI(
        openai_api_key=OPENAI_KEY, request_timeout=120
    )

    if os.environ.get("PINECONE_API_KEY", False):
        # If the Pinecone API key is specified, use the Pinecone database.
        memory = ConversationStore.get_vector_retrieval(
            'ReadTheDocs', OPENAI_KEY,
            index_name=os.getenv("PINECONE_INDEX"),
            search_type='similarity_score_threshold',
            search_kwargs={'score_threshold': 0.0},
        )
    else:
        # Otherwise fall back to the local Chroma retriever built at startup.
        memory = local_memory

    tools = get_tools(memory)

    ai_name = 'Sherpa'
    ai_id = bot['user_id']
    # Replace the bot's Slack user-id mention with its friendly name so the
    # LLM sees "@Sherpa" rather than an opaque id.
    question = question.replace(f'@{ai_id}', f'@{ai_name}')

    task_agent = TaskAgent.from_llm_and_tools(
        ai_name="Sherpa",
        ai_role="assistant",
        ai_id=bot['user_id'],
        memory=memory,
        tools=tools,
        previous_messages=previous_messages,
        llm=llm,
    )
    return task_agent.run(question)

Expand Down
2 changes: 1 addition & 1 deletion apps/slackbot/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def format_messages(self, **kwargs: Any) -> List[BaseMessage]:
messages: List[BaseMessage] = [base_prompt, time_prompt]
messages += historical_messages
messages.append(input_message)
print("all_prompt:", previous_messages)
print("all_prompt:", messages)
return messages

def process_chat_history(self, messages: List[dict]) -> List[BaseMessage]:
Expand Down
10 changes: 5 additions & 5 deletions apps/slackbot/task_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def from_llm_and_tools(

def run(self, task: str) -> str:
user_input = (
"Determine which next command to use, "
"Determine which next command to use. "
"and respond using the JSON format specified above without any extra text."
"\n JSON Response: \n"
)
Expand All @@ -105,9 +105,9 @@ def run(self, task: str) -> str:

if loop_count >= self.max_iterations:
user_input = (
"Use information gathered above to finish the task."
"if the tool used is Search Tool, create inline citation at the of the sentence that use the result of the Search Tool"
"Give a number of citation and put the link from result of a search tool at each inline citation"
f"Use the above information to respond to the user's message:\n{task}\n\n"
f"If you use any resource, then create inline citation by adding the source link of the reference document at the of the sentence."
f"Only use the link given in the reference document. DO NOT create link by yourself. DO NOT include citation if the resource is not necessary. "
"only write text but not the JSON format specified above. \nResult:"
)

Expand Down Expand Up @@ -135,7 +135,7 @@ def run(self, task: str) -> str:
return assistant_reply
return result["command"]["args"]["response"]


# Get command name and arguments
action = self.output_parser.parse(assistant_reply)
print("action:", action)
Expand Down
65 changes: 20 additions & 45 deletions apps/slackbot/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
import requests
from bs4 import BeautifulSoup
from langchain.utilities import GoogleSerperAPIWrapper
from langchain.docstore.document import Document
from langchain.prompts import Prompt
from langchain.vectorstores.base import VectorStoreRetriever
from typing_extensions import Literal
import os


def get_tools(memory):
Expand All @@ -16,36 +16,24 @@ def get_tools(memory):
"in the instruction.\nTask: {input}\nResult: "
)
prompt = Prompt.from_template(prompt)
# llm_chain = LLMChain(llm=llm, prompt=prompt)
search_tool = SearchTool(api_wrapper=GoogleSerperAPIWrapper())
# llm_tool = LLMTool(llm_chain=llm_chain)
tools = []

# user_input_tool = UserInputTool()
context_tool = ContextTool(memory=memory)
tools.append(ContextTool(memory=memory))

return [search_tool, context_tool]
if os.environ.get("SERPER_API_KEY", False):
search_tool = SearchTool(api_wrapper=GoogleSerperAPIWrapper())
tools.append(search_tool)
else:
print("No SERPER_API_KEY found in environment variables, skipping SearchTool")


class ScrapeTool(BaseTool):
name = "Scrape"
description = "A tool for scraping a website for information."
chunk_size = 200

def _run(self, url: str) -> str:
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
data = soup.get_text(strip=True)

return data

def _arun(self, *args: Any, **kwargs: Any):
raise NotImplementedError("ScrapeTool does not support async run")
return tools


class SearchTool(BaseTool):
name = "Search"
description = (
"Access the internet to search for the information, the input is a search query"
"Access the internet to search for the information, only use this tool when "
"you cannot find the informaiton using internal search."
)
api_wrapper: GoogleSerperAPIWrapper

Expand All @@ -54,7 +42,7 @@ def _run(self, query: str) -> str:
search_results = google_serper._google_serper_api_results(query)

# case 1: answerBox in the result dictionary
if search_results.get("answerBox"):
if search_results.get("answerBox", False):
answer_box = search_results.get("answerBox", {})
if answer_box.get("answer"):
answer = answer_box.get("answer")
Expand All @@ -69,7 +57,7 @@ def _run(self, query: str) -> str:

# case 2: knowledgeGraph in the result dictionary
snippets = []
if search_results.get("knowledgeGraph"):
if search_results.get("knowledgeGraph", False):
kg = search_results.get("knowledgeGraph", {})
title = kg.get("title")
entity_type = kg.get("type")
Expand Down Expand Up @@ -106,32 +94,19 @@ def _run(self, query: str) -> str:
full_result = "\n".join(result)

# answer = " ".join(snippets)
answer = "Description: " + search_results["knowledgeGraph"]['title'] + search_results["knowledgeGraph"]['description'] + "\nLink: " + search_results["knowledgeGraph"]['descriptionLink']
return answer + full_result
if 'knowledgeGraph' in search_results:
answer = "Description: " + search_results["knowledgeGraph"]['title'] + search_results["knowledgeGraph"]['description'] + "\nLink: " + search_results["knowledgeGraph"]['descriptionLink']
full_result = answer + "\n" + full_result
return full_result

def _arun(self, query: str) -> str:
raise NotImplementedError("SearchTool does not support async run")


class LLMTool(BaseTool):
name = "LLM"
description = (
"Access the LLM to perform different tasks"
)
llm_chain: LLMChain

def _run(self, query: str) -> str:
return self.llm_chain.run(input=query)

def _arun(self, query: str) -> str:
raise NotImplementedError("LLMTool does not support async run")


class ContextTool(BaseTool):
name = "Context"
name = "Context Search"
description = (
"Access the read-only domain specific internal documents for the task."
"You use this tool if you need further clarification of the task."
"Access internal documents for various information."
)
memory: VectorStoreRetriever

Expand All @@ -140,7 +115,7 @@ def _run(self, query: str) -> str:
result = ""
for doc in docs:
result += "Document" + doc.page_content + "\nLink" + doc.metadata.get("source", "")

return result

def _arun(self, query: str) -> str:
Expand Down
19 changes: 19 additions & 0 deletions apps/slackbot/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredPDFLoader, UnstructuredMarkdownLoader


def load_files(files: List[str]) -> List[Document]:
    """Load document files into langchain Documents.

    Supported formats: PDF (UnstructuredPDFLoader) and Markdown
    (UnstructuredMarkdownLoader).

    Args:
        files: paths of the files to load.

    Returns:
        All loaded Documents, in the order the paths were given.

    Raises:
        NotImplementedError: if a path has an unsupported extension.
    """
    documents: List[Document] = []
    for path in files:
        print(f'Loading file {path}')
        if path.endswith(".pdf"):
            loader = UnstructuredPDFLoader(path)
        elif path.endswith(".md"):
            loader = UnstructuredMarkdownLoader(path)
        else:
            raise NotImplementedError(f"File type {path} not supported")
        documents.extend(loader.load())

    # NOTE: removed the debug `print(documents)` — it dumped the full text of
    # every loaded document to stdout on each call.
    return documents
68 changes: 52 additions & 16 deletions apps/slackbot/vectorstores.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
import pinecone
import os
import uuid
from typing import Any, Iterable, List, Optional, Type
from typing import Any, Iterable, List, Optional, Tuple, Type
from langchain.docstore.document import Document
import logging
from langchain.vectorstores.base import VectorStoreRetriever
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import Chroma
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from utils import load_files


PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
Expand Down Expand Up @@ -69,6 +70,31 @@ def similarity_search(
if res["score"] > threshold:
docs.append(Document(page_content=text, metadata=metadata))
return docs

def _similarity_search_with_relevance_scores(
    self,
    query: str,
    k: int = 4,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return the top-k documents for `query` paired with their scores.

    The query is embedded with `self.embeddings` and run against the
    Pinecone index `self.db` inside `self.namespace`. The raw Pinecone
    match score is returned unmodified alongside each Document.

    Args:
        query: natural-language search query.
        k: maximum number of matches to return.
        **kwargs: may contain `filter`, forwarded to Pinecone's query.

    Returns:
        List of (Document, score) tuples, in Pinecone's result order.
    """
    query_embedding = self.embeddings.embed_query(query)
    results = self.db.query(
        [query_embedding],
        top_k=k,
        include_metadata=True,
        namespace=self.namespace,
        filter=kwargs.get("filter", None),
    )

    docs_with_score = []
    for match in results["matches"]:
        metadata = match["metadata"]
        # The stored text lives under `self.text_key`; pop it out so the
        # remaining metadata rides along on the Document.
        text = metadata.pop(self.text_key)
        docs_with_score.append(
            (Document(page_content=text, metadata=metadata), match["score"])
        )
    return docs_with_score


@classmethod
def delete(cls, namespace, index_name):
Expand All @@ -78,26 +104,36 @@ def delete(cls, namespace, index_name):

@classmethod
def get_vector_retrieval(
    cls, namespace: str, openai_api_key: str, index_name: str,
    search_type='similarity', search_kwargs=None
) -> VectorStoreRetriever:
    """Build a VectorStoreRetriever over an existing Pinecone index.

    Args:
        namespace: Pinecone namespace to search in.
        openai_api_key: key used for query embeddings.
        index_name: name of the Pinecone index.
        search_type: retriever search strategy (e.g. 'similarity' or
            'similarity_score_threshold').
        search_kwargs: extra retriever search options. Defaults to {}
            (a fresh dict per call — a `={}` default would be shared
            and mutable across calls).
    """
    if search_kwargs is None:
        search_kwargs = {}
    vectorstore = cls.from_index(namespace, openai_api_key, index_name)
    retriever = VectorStoreRetriever(
        vectorstore=vectorstore,
        search_type=search_type,
        search_kwargs=search_kwargs,
    )
    return retriever

@classmethod
def from_texts(cls, texts: List[str], embedding: Embeddings, metadatas: list[dict]):
    """Deliberately unsupported: this store is built from an existing
    Pinecone index (see `from_index` / `get_vector_retrieval`), not raw texts.
    """
    raise NotImplementedError("ConversationStore does not support from_texts")



def get_local_db(pdf_folder_path, openai_api_key) -> VectorStoreRetriever:
loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
# loaders
documents = []
for loader in loaders:
documents.extend(loader.load())

index = VectorstoreIndexCreator(
embedding=OpenAIEmbeddings(openai_api_key=openai_api_key),
text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)
class LocalChromaStore(Chroma):
    """Chroma-backed vector store built from a local folder of documents.

    Used as the fallback memory when no Pinecone API key is configured.
    """

    @classmethod
    def from_folder(cls, file_path, openai_api_key, index_name='chroma'):
        """
        Create a Chroma DB from a folder of files (currently only supports
        PDF and Markdown files).

        file_path: path to the folder
        openai_api_key: OpenAI API key used for embeddings
        index_name: name of the index
        """
        # os.path.join is portable; the original concatenated with '/'.
        files = [os.path.join(file_path, name) for name in os.listdir(file_path)]
        documents = load_files(files)

        embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
        chroma = cls(index_name, embeddings)
        # Keep chunks small enough for the embedding model.
        # (Fixes the `test_splitter` typo in the original.)
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        chunks = text_splitter.split_documents(documents)
        chroma.add_documents(chunks)
        return chroma
return chroma