diff --git a/apps/.gitignore b/apps/.gitignore new file mode 100644 index 00000000..4863d269 --- /dev/null +++ b/apps/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +**/.env \ No newline at end of file diff --git a/apps/slackbot/Dockerfile b/apps/slackbot/Dockerfile index 683841b3..871ced36 100644 --- a/apps/slackbot/Dockerfile +++ b/apps/slackbot/Dockerfile @@ -21,4 +21,4 @@ COPY . . EXPOSE 80 # Run the Flask app -CMD ["python", "app.py"] +CMD ["python", "bolt_app.py"] diff --git a/apps/slackbot/README.md b/apps/slackbot/README.md index 03666e6f..8d91228d 100644 --- a/apps/slackbot/README.md +++ b/apps/slackbot/README.md @@ -29,9 +29,12 @@ This repository contains a chatbot implementation using Flask and Slack. The cha . All these tokens should be added in .env file SLACK_SIGNING_SECRET: Slack apps signing secret. - SLACK_BOT_TOKEN: Slack bot token for authentication. + SLACK_OAUTH_TOKEN: Slack bot token for authentication. VERIFICATION_TOKEN: Slack verification token. OPENAI_API_KEY: OpenAI API key for language modeling. + PINECONE_INDEX: The Pinecone vector database index + PINECONE_API_KEY: The Pinecone vector database API key + PINECONE_ENV: Region where the Pinecone index is deployed All these tokens should be added in .env file @@ -47,6 +50,8 @@ This repository contains a chatbot implementation using Flask and Slack. The cha 2. Expose the server to the internet using a tool like ngrok. Not required in hosted on public IP 3. Set up the Slack app's Event Subscriptions and provide the ngrok URL as the Request URL. + * **NOTE:** When add the url to the Slack app, make sure to append `/slack/events` at the end as this is the default path used by Slack Bolt. 
+ # Reference diff --git a/apps/slackbot/app.py b/apps/slackbot/app.py index 006913fc..c33e0441 100644 --- a/apps/slackbot/app.py +++ b/apps/slackbot/app.py @@ -28,6 +28,8 @@ import atexit load_dotenv() +from vectorstores import get_local_db + # This `app` represents your existing Flask app app = Flask(__name__) @@ -176,20 +178,12 @@ def createIndex(pdf_folder_path): global loaders global chain global index - loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)] - # loaders - documents = [] - for loader in loaders: - documents.extend(loader.load()) - - index = VectorstoreIndexCreator( - embedding=OpenAIEmbeddings(openai_api_key=OPENAI_KEY), - text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders) + retrival = get_local_db(pdf_folder_path, OPENAI_KEY) llm = OpenAI(model_name="gpt-3.5-turbo", openai_api_key=OPENAI_KEY) chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", - retriever=index.vectorstore.as_retriever(), + retrieve=retrival, input_key="question") return chain diff --git a/apps/slackbot/bolt_app.py b/apps/slackbot/bolt_app.py new file mode 100644 index 00000000..27a376d6 --- /dev/null +++ b/apps/slackbot/bolt_app.py @@ -0,0 +1,153 @@ +############################################## +# Implementation of the slack app using Bolt +# Importing necessary modules +############################################## + +import os +from dotenv import load_dotenv +load_dotenv() +from langchain.chat_models import ChatOpenAI +from langchain import LLMChain +from langchain.chains.question_answering import load_qa_chain +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.text_splitter import CharacterTextSplitter +from langchain.vectorstores import FAISS +from langchain.llms import OpenAI +from os import environ +from vectorstores import ConversationStore +from prompt import SlackBotPrompt +from slack_bolt import App + + + +# This `app` represents your 
@app.event("app_mention")
def event_test(client, say, event):
    """Reply to an ``app_mention`` event inside the thread it belongs to.

    Args:
        client: Slack Web API client injected by Bolt; used to fetch the
            thread's replies.
        say: Bolt helper used to post the answer back to the channel.
        event: The ``app_mention`` event payload. ``text``, ``channel`` and
            ``ts`` are read directly; ``thread_ts`` is optional.
    """
    mention_text = event['text']

    # A mention posted outside a thread carries no "thread_ts"; its own "ts"
    # is then used as the thread anchor so the reply starts a thread on it.
    root_ts = event.get("thread_ts", None)
    if not root_ts:
        root_ts = event["ts"]

    thread = client.conversations_replies(channel=event['channel'], ts=root_ts)
    # The last entry is dropped from the history (presumably the mention
    # itself, which is already passed separately as the question).
    history = thread['messages'][:-1]

    say(get_response(mention_text, history), thread_ts=root_ts)
def get_response(question, previous_messages):
    """Run the question and thread history through the LLM chain.

    Args:
        question: Text of the message that mentioned the bot.
        previous_messages: Raw Slack message dicts from the same thread,
            forwarded to the prompt as chat history.

    Returns:
        Whatever ``LLMChain.run`` returns for the assembled prompt.
    """
    chat_model = ChatOpenAI(openai_api_key=OPENAI_KEY, request_timeout=120)

    # The prompt needs the bot's own Slack id to tell its messages apart from
    # the users' and the model's tokenizer to stay within the token budget.
    prompt_settings = {
        "ai_name": 'Sherpa',
        "ai_id": bot['user_id'],
        "token_counter": chat_model.get_num_tokens,
        "input_variables": ['query', 'messages', 'retriever'],
    }
    bot_prompt = SlackBotPrompt(**prompt_settings)

    pinecone_index = os.getenv("PINECONE_INDEX")
    doc_retriever = ConversationStore.get_vector_retrieval(
        'ReadTheDocs', OPENAI_KEY, index_name=pinecone_index
    )

    run_inputs = {
        "query": question,
        "messages": previous_messages,
        "retriever": doc_retriever,
    }
    return LLMChain(llm=chat_model, prompt=bot_prompt).run(**run_inputs)
class SlackBotPrompt(BaseChatPromptTemplate, BaseModel):
    """Chat prompt template that assembles the message list sent to the LLM.

    The rendered prompt is, in order: a system persona message, a system
    message with the current time, a system message carrying retrieved
    reference documents, a window of recent thread messages, and finally the
    user's query together with citation instructions.
    """

    # Name the bot is given in the persona text and in rewritten mentions.
    ai_name: str
    # Id compared against each message's "user" field to spot the bot's own
    # messages, and replaced by ``ai_name`` inside mentions.
    ai_id: str
    # Callable returning the number of tokens in a string.
    token_counter: Callable[[str], int]
    # Overall token budget; 1000 tokens of it are held back when selecting
    # chat history (see ``format_messages``).
    send_token_limit: int = 4196

    def construct_base_prompt(self):
        """Return the persona text that introduces the bot by name."""
        full_prompt = (
            f"You are a friendly assistent bot called {self.ai_name}\n\n"
        )

        return full_prompt

    def format_messages(self, **kwargs: Any) -> List[BaseMessage]:
        """Build the full list of chat messages for one query.

        Args:
            **kwargs: Must contain ``query`` (the user's message text),
                ``messages`` (raw Slack message dicts of the thread) and
                ``retriever`` (an object exposing
                ``get_relevant_documents``).

        Returns:
            The system messages, the selected chat history and the final
            human message, in that order.

        Raises:
            KeyError: If a required keyword is missing or a retrieved
                document has no ``source`` metadata.
        """
        base_prompt = SystemMessage(content=self.construct_base_prompt())
        time_prompt = SystemMessage(
            content=f"The current time and date is {time.strftime('%c')}"
        )
        used_tokens = self.token_counter(base_prompt.content) + self.token_counter(
            time_prompt.content
        )

        query = kwargs["query"]
        retriever: VectorStoreRetriever = kwargs["retriever"]
        previous_messages = self.process_chat_history(kwargs["messages"])

        # Retrieve reference documents and render each with its source link.
        relevant_docs = retriever.get_relevant_documents(query)
        relevant_memory = [
            "Document: " + d.page_content + "\nLink: " + d.metadata["source"] + "\n"
            for d in relevant_docs
        ]

        # Drop documents from the end until the prompt fits the 2500-token
        # budget. Each document is counted once, and the loop stops when no
        # documents remain so it cannot spin forever if the fixed prompts
        # alone already exceed the budget.
        doc_tokens = [self.token_counter(doc) for doc in relevant_memory]
        relevant_memory_tokens = sum(doc_tokens)
        while relevant_memory and used_tokens + relevant_memory_tokens > 2500:
            relevant_memory.pop()
            relevant_memory_tokens -= doc_tokens.pop()

        # Join the documents into plain text; interpolating the list itself
        # would embed its repr (brackets, quotes, escaped newlines).
        documents_text = "".join(relevant_memory)
        content_format = (
            "Here are some documents that may be relevant to the topic:"
            f"\n{documents_text}\n\n"
        )

        instructions = (
            f"Use the above information to respond to the user's message:\n{query}\n\n"
            "create inline citation by adding the source link of the reference "
            "document at the end of the sentence. "
            "Only use the link given in the reference document. "
            "DO NOT create link by yourself."
        )

        memory_message = SystemMessage(content=content_format)
        used_tokens += self.token_counter(memory_message.content)

        # Walk the last ten messages newest-first, keeping as many as fit
        # while holding back 1000 tokens of the send limit; prepending keeps
        # the kept messages in chronological order.
        historical_messages: List[BaseMessage] = []
        for message in previous_messages[-10:][::-1]:
            message_tokens = self.token_counter(message.content)
            if used_tokens + message_tokens > self.send_token_limit - 1000:
                break
            historical_messages = [message] + historical_messages
            used_tokens += message_tokens

        input_message = HumanMessage(content=instructions)

        messages: List[BaseMessage] = [base_prompt, time_prompt, memory_message]
        messages += historical_messages
        messages.append(input_message)

        return messages

    def process_chat_history(self, messages: List[dict]) -> List[BaseMessage]:
        """Convert raw Slack message dicts into chat messages.

        Entries whose ``type`` is neither ``message`` nor ``text`` are
        skipped. A message whose ``user`` equals ``ai_id`` becomes an
        ``AIMessage``; everything else, including messages that carry no
        ``user`` field at all, becomes a ``HumanMessage``.

        Args:
            messages: Message dicts as returned by Slack for a thread.

        Returns:
            The converted messages in their original order.
        """
        results = []

        for message in messages:
            if message.get('type') not in ('message', 'text'):
                continue

            # .get(): some Slack messages (e.g. bot_message subtype) have no
            # "user" key, which previously raised KeyError.
            is_own_message = message.get('user') == self.ai_id
            message_cls = AIMessage if is_own_message else HumanMessage
            # Replace the bot's id in mentions with its readable name.
            text = (message.get('text') or '').replace(
                f'@{self.ai_id}', f'@{self.ai_name}'
            )
            results.append(message_cls(content=text))

        return results
class ConversationStore(VectorStore):
    """Vector store backed by a Pinecone index, scoped to one namespace.

    Texts are embedded with the supplied embeddings object and stored with
    their metadata; the raw text itself is kept in the metadata under
    ``text_key`` so it can be recovered at query time.
    """

    def __init__(self, namespace, db, embeddings, text_key):
        """Store the index handle and settings.

        Args:
            namespace: Namespace passed to every upsert and query.
            db: Index object exposing ``upsert`` and ``query``.
            embeddings: Object exposing ``embed_query``.
            text_key: Metadata key under which the raw text is stored.
        """
        self.db = db
        self.namespace = namespace
        self.embeddings = embeddings
        self.text_key = text_key

    @classmethod
    def from_index(cls, namespace, openai_api_key, index_name, text_key="text"):
        """Connect to an existing Pinecone index and wrap it in a store."""
        pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
        logging.info("Loading index %s from Pinecone", index_name)
        index = pinecone.Index(index_name)
        embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
        return cls(namespace, index, embeddings, text_key)

    def add_text(self, text: str, metadata: Optional[dict] = None) -> str:
        """Embed ``text`` and upsert it as a single vector.

        Args:
            text: Text to embed and store.
            metadata: Optional metadata stored alongside the vector. The
                dict is copied, so neither the caller's dict nor a shared
                default is mutated.

        Returns:
            The generated id of the stored vector.
        """
        # Copy before adding the text key: the previous mutable default
        # (``metadata={}``) was shared and mutated across calls.
        record_metadata = dict(metadata) if metadata else {}
        record_metadata[self.text_key] = text
        vector_id = str(uuid.uuid4())
        embedding = self.embeddings.embed_query(text)
        doc = {"id": vector_id, "values": embedding, "metadata": record_metadata}
        self.db.upsert(vectors=[doc], namespace=self.namespace)

        return vector_id

    def add_texts(
        self, texts: Iterable[str], metadatas: Optional[List[dict]] = None
    ) -> List[str]:
        """Store several texts and return their generated ids.

        Args:
            texts: Texts to embed and store.
            metadatas: Optional metadata dicts paired with ``texts`` by
                position; when omitted every text is stored without extra
                metadata.

        Returns:
            The ids of the stored vectors, in input order.
        """
        if metadatas is None:
            return [self.add_text(text) for text in texts]
        return [
            self.add_text(text, metadata)
            for text, metadata in zip(texts, metadatas)
        ]

    def similarity_search(
        self,
        text: str,
        top_k: int = 5,
        filter: Optional[dict] = None,
        threshold: float = 0.7,
    ) -> list[Document]:
        """Return stored documents similar to ``text``.

        Args:
            text: Query text to embed.
            top_k: Maximum number of matches requested from the index.
            filter: Optional metadata filter forwarded to the index query.
            threshold: Matches must score strictly above this value to be
                returned.

        Returns:
            Documents for the matches above ``threshold``; the stored text
            becomes ``page_content`` and the remaining metadata is attached.
        """
        query_embedding = self.embeddings.embed_query(text)
        results = self.db.query(
            [query_embedding],
            top_k=top_k,
            include_metadata=True,
            namespace=self.namespace,
            filter=filter,
        )

        docs = []
        for res in results["matches"]:
            if not res["score"] > threshold:
                continue
            metadata = res["metadata"]
            # The raw text lives in the metadata; pull it out so it is not
            # duplicated in the document's metadata.
            page_content = metadata.pop(self.text_key)
            docs.append(Document(page_content=page_content, metadata=metadata))
        return docs

    @classmethod
    def delete(cls, namespace, index_name):
        """Delete every vector in ``namespace`` of the named index."""
        pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
        index = pinecone.Index(index_name)
        return index.delete(delete_all=True, namespace=namespace)

    @classmethod
    def get_vector_retrieval(
        cls, namespace: str, openai_api_key: str, index_name: str
    ) -> VectorStoreRetriever:
        """Return a retriever over the given namespace of a Pinecone index."""
        vectorstore = cls.from_index(namespace, openai_api_key, index_name)
        retriever = VectorStoreRetriever(vectorstore=vectorstore)
        return retriever

    @classmethod
    def from_texts(cls, texts: List[str], embedding: Embeddings, metadatas: list[dict]):
        """Unsupported constructor; always raises ``NotImplementedError``."""
        raise NotImplementedError("ConversationStore does not support from_texts")
https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting pinecone-client\n", + " Downloading pinecone_client-2.2.2-py3-none-any.whl (179 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.1/179.1 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (2.27.1)\n", + "Requirement already satisfied: pyyaml>=5.4 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (6.0)\n", + "Collecting loguru>=0.5.0 (from pinecone-client)\n", + " Downloading loguru-0.7.0-py3-none-any.whl (59 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.0/60.0 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (4.5.0)\n", + "Collecting dnspython>=2.0.0 (from pinecone-client)\n", + " Downloading dnspython-2.3.0-py3-none-any.whl (283 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m283.7/283.7 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: python-dateutil>=2.5.3 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (2.8.2)\n", + "Requirement already satisfied: urllib3>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (1.26.15)\n", + "Requirement already satisfied: tqdm>=4.64.1 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (4.65.0)\n", + "Requirement already satisfied: numpy>=1.22.0 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (1.22.4)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.5.3->pinecone-client) 
(1.16.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->pinecone-client) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->pinecone-client) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->pinecone-client) (3.4)\n", + "Installing collected packages: loguru, dnspython, pinecone-client\n", + "Successfully installed dnspython-2.3.0 loguru-0.7.0 pinecone-client-2.2.2\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting langchain\n", + " Downloading langchain-0.0.198-py3-none-any.whl (1.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: PyYAML>=5.4.1 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0)\n", + "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.10)\n", + "Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)\n", + " Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m49.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting async-timeout<5.0.0,>=4.0.0 (from langchain)\n", + " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n", + "Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain)\n", + " Downloading dataclasses_json-0.5.8-py3-none-any.whl (26 kB)\n", + "Collecting langchainplus-sdk>=0.0.7 (from langchain)\n", + " Downloading langchainplus_sdk-0.0.9-py3-none-any.whl (21 kB)\n", + "Requirement already 
satisfied: numexpr<3.0.0,>=2.8.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.8.4)\n", + "Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.22.4)\n", + "Collecting openapi-schema-pydantic<2.0,>=1.2 (from langchain)\n", + " Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m90.0/90.0 kB\u001b[0m \u001b[31m10.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pydantic<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.10.7)\n", + "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.27.1)\n", + "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (8.2.2)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.1.0)\n", + "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (2.0.12)\n", + "Collecting multidict<7.0,>=4.5 (from aiohttp<4.0.0,>=3.8.3->langchain)\n", + " Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.5/114.5 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting yarl<2.0,>=1.0 (from aiohttp<4.0.0,>=3.8.3->langchain)\n", + " Downloading yarl-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (268 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m25.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting frozenlist>=1.1.1 (from 
aiohttp<4.0.0,>=3.8.3->langchain)\n", + " Downloading frozenlist-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (149 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m149.6/149.6 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting aiosignal>=1.1.2 (from aiohttp<4.0.0,>=3.8.3->langchain)\n", + " Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n", + "Collecting marshmallow<4.0.0,>=3.3.0 (from dataclasses-json<0.6.0,>=0.5.7->langchain)\n", + " Downloading marshmallow-3.19.0-py3-none-any.whl (49 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.1/49.1 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting marshmallow-enum<2.0.0,>=1.5.1 (from dataclasses-json<0.6.0,>=0.5.7->langchain)\n", + " Downloading marshmallow_enum-1.5.1-py2.py3-none-any.whl (4.2 kB)\n", + "Collecting typing-inspect>=0.4.0 (from dataclasses-json<0.6.0,>=0.5.7->langchain)\n", + " Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n", + "Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<2,>=1->langchain) (4.5.0)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2022.12.7)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.4)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (2.0.2)\n", + "Requirement already satisfied: packaging>=17.0 in /usr/local/lib/python3.10/dist-packages (from 
marshmallow<4.0.0,>=3.3.0->dataclasses-json<0.6.0,>=0.5.7->langchain) (23.1)\n", + "Collecting mypy-extensions>=0.3.0 (from typing-inspect>=0.4.0->dataclasses-json<0.6.0,>=0.5.7->langchain)\n", + " Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", + "Installing collected packages: mypy-extensions, multidict, marshmallow, frozenlist, async-timeout, yarl, typing-inspect, openapi-schema-pydantic, marshmallow-enum, langchainplus-sdk, aiosignal, dataclasses-json, aiohttp, langchain\n", + "Successfully installed aiohttp-3.8.4 aiosignal-1.3.1 async-timeout-4.0.2 dataclasses-json-0.5.8 frozenlist-1.3.3 langchain-0.0.198 langchainplus-sdk-0.0.9 marshmallow-3.19.0 marshmallow-enum-1.5.1 multidict-6.0.4 mypy-extensions-1.0.0 openapi-schema-pydantic-1.2.4 typing-inspect-0.9.0 yarl-1.9.2\n" + ] + } + ], + "source": [ + "!pip3 install pinecone-client\n", + "!pip install langchain" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hPmYHpiMyK3y", + "outputId": "c28ec1c9-58cb-4020-81ed-e21b6c62bfc0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (4.11.2)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4) (2.4.1)\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting tiktoken\n", + " Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m18.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: regex>=2022.1.18 in 
/usr/local/lib/python3.10/dist-packages (from tiktoken) (2022.10.31)\n", + "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.10/dist-packages (from tiktoken) (2.27.1)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken) (3.4)\n", + "Installing collected packages: tiktoken\n", + "Successfully installed tiktoken-0.4.0\n" + ] + } + ], + "source": [ + "!pip install beautifulsoup4\n", + "!pip install tiktoken" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tgI_j45SyN96", + "outputId": "233333a9-f76b-47bf-d65e-2637a6a40ad9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting openai\n", + " Downloading openai-0.27.8-py3-none-any.whl (73 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.6/73.6 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: requests>=2.20 in /usr/local/lib/python3.10/dist-packages (from openai) (2.27.1)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from openai) (4.65.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from openai) (3.8.4)\n", + "Requirement already satisfied: 
urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai) (1.26.15)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai) (2022.12.7)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai) (3.4)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai) (23.1.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai) (6.0.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai) (4.0.2)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai) (1.9.2)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai) (1.3.3)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai) (1.3.1)\n", + "Installing collected packages: openai\n", + "Successfully installed openai-0.27.8\n" + ] + } + ], + "source": [ + "!pip install openai" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6eEsZ4RYrcQY", + "outputId": "75c4b6de-970d-4a9c-a328-0332125a4f30" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/pinecone/index.py:4: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. 
in jupyter console)\n", + " from tqdm.autonotebook import tqdm\n" + ] + } + ], + "source": [ + "from langchain.embeddings import OpenAIEmbeddings\n", + "from langchain.embeddings.base import Embeddings\n", + "from langchain.vectorstores.base import VectorStore\n", + "import pinecone\n", + "import os\n", + "import uuid\n", + "from typing import Any, Iterable, List, Optional, Type\n", + "from langchain.docstore.document import Document\n", + "import logging\n", + "from langchain.vectorstores.base import VectorStoreRetriever" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "9zXEZ2o6yKRi" + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import ReadTheDocsLoader\n", + "from langchain.docstore import InMemoryDocstore\n", + "from langchain.embeddings import OpenAIEmbeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "bV3E6DN8yUsc" + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"OPENAI_API_KEY\"] = \"api key\"\n", + "openai_api_key = \"api key\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mmbif0CXyYgg" + }, + "source": [ + "# PINECONE" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "-YXly-LtrNsI" + }, + "outputs": [], + "source": [ + "PINECONE_INDEX=\"langchain\"\n", + "PINECONE_API_KEY=\"5b33d948-9710-42f7-8de0-1bf35a6bf54c\"\n", + "PINECONE_ENV=\"northamerica-northeast1-gcp\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "ZHieM1nbrXWg" + }, + "outputs": [], + "source": [ + "pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "FbZ-SChYrZlo" + }, + "outputs": [], + "source": [ + "active_indexes = pinecone.list_indexes()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": 
"K25JKiQKsI6P", + "outputId": "5e81f92a-06b5-411a-d054-a3c9fd3f3cca" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['langchain']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "active_indexes" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "P75zDFfmuLaJ" + }, + "outputs": [], + "source": [ + "index = pinecone.Index(\"langchain\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iCMEVEIduReI", + "outputId": "40b862af-8193-4540-dbed-9d4f66853601" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'dimension': 1536,\n", + " 'index_fullness': 0.0,\n", + " 'namespaces': {'01662450-28d4-49b7-8484-5f27f34a2cdc': {'vector_count': 24},\n", + " '023e8e97-eea9-4538-9fd7-afad634087df': {'vector_count': 3},\n", + " '06596c06-638b-4dd2-954f-acd5e21c8231': {'vector_count': 1},\n", + " '06e6ba9b-a273-4148-99c0-1613b5f9f239': {'vector_count': 2},\n", + " '0fc4f143-60b0-44b3-9865-1e85db59299c': {'vector_count': 1},\n", + " '11873f72-c580-4fea-83fe-e4bbe72f0410': {'vector_count': 1},\n", + " '1204cb97-7bf6-4617-8897-8355fa8da366': {'vector_count': 1},\n", + " '16f0978b-32e5-4115-90e2-09690d718331': {'vector_count': 92},\n", + " '175a95ed-6317-40e6-88ed-095f4d5dc03c': {'vector_count': 6},\n", + " '195bff2d-2aa4-40e3-9a03-443347da8f61': {'vector_count': 7},\n", + " '19cd51a5-e33c-4f5f-bca9-3bca31391006': {'vector_count': 30},\n", + " '1cf26fb0-89f1-4db8-a26a-45fe1e29244a': {'vector_count': 18},\n", + " '2538d125-438c-400b-b717-e9c764047f09': {'vector_count': 4},\n", + " '29e6e154-2231-4cc9-87d4-1c4be09884c1': {'vector_count': 1},\n", + " '2c7009c2-419e-40d1-8aab-ca7270a0db6e-0-0': {'vector_count': 3},\n", + " '2f77a1a8-a5ba-4d86-be99-62caf445c1ec': {'vector_count': 2},\n", + " '345e0fd8-13cf-4a36-beb8-2e48a531b18d-0-0': {'vector_count': 3},\n", + " 
'34be3597-a76f-4165-a580-f5b7a537a7f1-0-0': {'vector_count': 2},\n", + " '34be3597-a76f-4165-a580-f5b7a537a7f1-1-0': {'vector_count': 1},\n", + " '35392789-c88b-485a-b80b-1f6dbf0678a3': {'vector_count': 1},\n", + " '3841e98c-8948-4c78-9adf-460b67078d61': {'vector_count': 7},\n", + " '386f8f29-f454-41f3-83d7-08dd4051e5ea': {'vector_count': 16},\n", + " '3cb4ca65-65d0-482f-a8f9-6f517bdf2ff2': {'vector_count': 2},\n", + " '3dd891ae-4466-4a78-8194-77d4e2c6fad9': {'vector_count': 1},\n", + " '3feb2f47-4f11-4d48-b836-731494b03b34': {'vector_count': 1},\n", + " '41953c44-7c42-4396-8989-f446236c1754': {'vector_count': 1},\n", + " '45145592-3f61-4f4d-adbc-1964267f7b01': {'vector_count': 1},\n", + " '463cb62c-3c9f-4c5f-afe5-1a1f6714cd1e': {'vector_count': 1},\n", + " '4640102d-71ad-4ee2-aa6b-6e0186bb2ca8': {'vector_count': 1},\n", + " '46b01f94-a33a-47c7-bc6d-4a67ad3bdbe3': {'vector_count': 55},\n", + " '49a0ae07-6c52-4b53-9ad2-27834a8b2ad8': {'vector_count': 2},\n", + " '4ace051e-a06f-41ac-8347-a0219ff13620': {'vector_count': 15},\n", + " '4b30804e-ec4e-4660-acbc-dcd1aad852d2': {'vector_count': 2},\n", + " '4f2259d0-4978-46b5-87b9-01494dbf52e6': {'vector_count': 1},\n", + " '5038f61b-94fe-4d57-8893-b940cf081d1c': {'vector_count': 1},\n", + " '51877c86-1097-4ab5-b96d-d9908f9b8c79': {'vector_count': 3},\n", + " '59418804-bd27-49d2-afd9-85232b275372': {'vector_count': 6},\n", + " '5bfca4c0-3751-4c23-a0bb-71cd3b80bba8': {'vector_count': 15},\n", + " '5d69c525-89e2-48b8-86a6-ef8489beb465-0-0': {'vector_count': 4},\n", + " '5d69c525-89e2-48b8-86a6-ef8489beb465-1-0': {'vector_count': 2},\n", + " '5e7d6969-108c-400c-8e3a-3607e666df70': {'vector_count': 7},\n", + " '5fc58d0e-a1eb-44fd-bc21-66f25313dd98': {'vector_count': 1},\n", + " '60a18b0b-cfc7-4b93-b62e-47e1e62027d1': {'vector_count': 2},\n", + " '60bfe64e-2dd8-40e4-bd7d-1e033dc3db18': {'vector_count': 10},\n", + " '62d6c8e2-babc-4f2c-9f01-76ee5e513b27': {'vector_count': 30},\n", + " '6441f86c-274c-485f-9849-2f0e30022eec': 
{'vector_count': 6},\n", + " '64f41368-91f0-482c-8b70-e979cbf03426': {'vector_count': 1},\n", + " '6581bd08-8768-4bc3-8dec-66f9a9a6c81f': {'vector_count': 20},\n", + " '65da3b01-0956-4298-988e-bf0da26f367a': {'vector_count': 3},\n", + " '67c5b8ae-8c48-4f18-9267-cbafec896cfd': {'vector_count': 6},\n", + " '68ba8730-914a-44b6-9a02-9a7a0d5ee3d9': {'vector_count': 6},\n", + " '74164868-8548-43fb-9f94-515c13f3103a': {'vector_count': 2},\n", + " '7561a19f-1c90-45a0-93af-2a5e57172419': {'vector_count': 2},\n", + " '77813683-3099-44e1-bfe9-7bfaedadbaf1': {'vector_count': 38},\n", + " '7d25d04c-1c0b-4869-94fb-3c1699b4a9c1-0-0': {'vector_count': 7},\n", + " '7e4b3ad0-f746-4366-b6ce-5aa45702275b': {'vector_count': 1},\n", + " '7fed1522-9761-4ce6-9553-c0597519c0cc': {'vector_count': 1},\n", + " '80ad2f67-e385-4d37-8acc-07a5ea137ef3': {'vector_count': 15},\n", + " '8dfb77d0-6bd4-4aa9-ac26-bbe52d5972c0-0-0': {'vector_count': 3},\n", + " '8ef0140a-64a5-4c5b-b564-b3a466f74e99': {'vector_count': 1},\n", + " '907ba889-b54f-4442-a376-32287b3c2147-0-0': {'vector_count': 3},\n", + " '907ba889-b54f-4442-a376-32287b3c2147-1-0': {'vector_count': 1},\n", + " '98955d2e-bdd7-4087-8471-63c851100cf7': {'vector_count': 6},\n", + " '999e9bba-2f88-4436-973b-16381a4553bc': {'vector_count': 16},\n", + " '9f835454-e2e2-4909-8331-93458c42fd39': {'vector_count': 1},\n", + " 'ReadTheDocs': {'vector_count': 3903},\n", + " 'a43d23f3-6114-4aa6-a33e-5eb9391791ec': {'vector_count': 3},\n", + " 'a5e7d04c-5914-4586-9be9-bd996cafdb21': {'vector_count': 51},\n", + " 'ad3dc281-9d3c-4504-befe-890645b8798e-0-0': {'vector_count': 2},\n", + " 'ad3dc281-9d3c-4504-befe-890645b8798e-1-0': {'vector_count': 3},\n", + " 'b1a5cd8b-d5f4-4286-94f6-a755e29e01e8': {'vector_count': 10},\n", + " 'b25e579c-acdf-4a94-83eb-f6ff0252e382': {'vector_count': 3},\n", + " 'b3624fc9-8e96-4aef-813d-b88f239ff3e1': {'vector_count': 6},\n", + " 'b3a05001-7042-4a60-bb67-23739bf39f7f': {'vector_count': 2},\n", + " 
'b5906fdc-d564-4e1f-b261-c48f236eb29e': {'vector_count': 1},\n", + " 'b649e762-3578-4017-b54f-1fa98678eb94': {'vector_count': 4},\n", + " 'ba8901af-af54-4e33-a460-81903f33e3de': {'vector_count': 1},\n", + " 'bbe59a3c-0ab5-47fa-95b9-7c42628712d5': {'vector_count': 5},\n", + " 'bd9dd710-316d-4395-840f-92106686a7b5': {'vector_count': 1},\n", + " 'bdf196f9-de25-4fce-8f3b-98a7e25118ec': {'vector_count': 7},\n", + " 'be8d554e-beb7-4ea9-aac8-3fc4e28d8388': {'vector_count': 3},\n", + " 'c0085ba7-b220-4d48-8f3d-14f62c15a2f3': {'vector_count': 18},\n", + " 'c4394301-a1b6-44b4-9ffb-9e2d404819ea': {'vector_count': 1},\n", + " 'c63776e4-4496-496c-bc7c-c7df1471bcd4': {'vector_count': 30},\n", + " 'c94fabff-c774-43a8-bc34-c9663e668739': {'vector_count': 6},\n", + " 'c9e76757-5fc0-48df-8242-389e87b21dcf': {'vector_count': 7},\n", + " 'cdcd065d-6485-4411-8e8b-4c95ae6c95be': {'vector_count': 1},\n", + " 'cfb63abc-a149-4e3c-b607-1698ee46ea51-0-0': {'vector_count': 4},\n", + " 'd041c90f-b801-4aca-b91b-9e32426f48c4-0-0': {'vector_count': 3},\n", + " 'd041c90f-b801-4aca-b91b-9e32426f48c4-1-0': {'vector_count': 1},\n", + " 'd35fcc74-1d05-484b-87c9-9ee2535ce799': {'vector_count': 6},\n", + " 'd555e569-bc94-4cf1-8dab-433fc14c6668': {'vector_count': 38},\n", + " 'd7119bff-c421-4b2d-8f56-15e75dbf0ab9': {'vector_count': 1},\n", + " 'd95e8f89-5ff5-477d-a01c-d8acaa4aef08': {'vector_count': 13},\n", + " 'e00894b9-2106-459c-a4e3-885d678548ec': {'vector_count': 13},\n", + " 'e0ebdc4b-1ec3-4cba-9416-498455d8a35c': {'vector_count': 7},\n", + " 'e1ecc3db-54c1-4951-9536-b3a4a39935f8': {'vector_count': 2},\n", + " 'e1f80bf6-18ac-45ba-bd6b-280ae4b21199': {'vector_count': 20},\n", + " 'e8c7479d-cbe8-4c6d-a863-b48d5e15c81a-0-0': {'vector_count': 2},\n", + " 'e8c7479d-cbe8-4c6d-a863-b48d5e15c81a-1-0': {'vector_count': 2},\n", + " 'ecda672f-4954-4a59-8eed-e32934fd2e23': {'vector_count': 1},\n", + " 'f100853d-d41b-49d0-8827-46cd696b9624': {'vector_count': 5},\n", + " 'f1a31140-92f8-4250-bd7b-11e8d0a4e92d': 
{'vector_count': 4},\n", + " 'f982e6dd-f84d-40ac-ac00-0a84c03fedab': {'vector_count': 1},\n", + " 'fe1a41eb-63df-402b-9272-3fd0195b1f39': {'vector_count': 13},\n", + " 'session_id-0-0': {'vector_count': 2},\n", + " 'session_id-1-0': {'vector_count': 4},\n", + " 'session_id-1-1': {'vector_count': 3},\n", + " 'session_id-1-2': {'vector_count': 2},\n", + " 'session_id-2-0': {'vector_count': 1},\n", + " 'session_id-3-0': {'vector_count': 1},\n", + " 'session_id-3-1': {'vector_count': 1},\n", + " 'session_id-3-2': {'vector_count': 1},\n", + " 'session_id-4-0': {'vector_count': 1},\n", + " 'session_id-4-1': {'vector_count': 1},\n", + " 'session_id-4-2': {'vector_count': 1},\n", + " 'session_id-5-0': {'vector_count': 2},\n", + " 'session_id-6-0': {'vector_count': 3},\n", + " 'session_id-6-1': {'vector_count': 3},\n", + " 'session_id-7-0': {'vector_count': 3},\n", + " 'session_id-7-1': {'vector_count': 1},\n", + " 'session_id-8-0': {'vector_count': 2},\n", + " 'test-1': {'vector_count': 4}},\n", + " 'total_vector_count': 4769}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "index.describe_index_stats()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zPpYg-wRzW1N" + }, + "outputs": [], + "source": [ + "index.delete(delete_all=True, namespace=\"ReadTheDocs\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VK7_ltmFyC2J" + }, + "source": [ + "# Read the doc" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "65xCJilpgWqm" + }, + "source": [ + "### File" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "id": "bz9hZXZJgZSv" + }, + "outputs": [], + "source": [ + "\"\"\"Loader that loads ReadTheDocs documentation directory dump.\"\"\"\n", + "from pathlib import Path\n", + "from typing import Any, List, Optional, Tuple, Union\n", + "\n", + "from langchain.docstore.document import Document\n", + "from 
langchain.document_loaders.base import BaseLoader\n", + "\n", + "# this loader should be used if the original loader from langchain returns empty page content\n", + "class ReadTheDocsLoader_custom(BaseLoader):\n", + " \"\"\"Loader that loads ReadTheDocs documentation directory dump.\"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " path: Union[str, Path],\n", + " encoding: Optional[str] = None,\n", + " errors: Optional[str] = None,\n", + " custom_html_tag: Optional[Tuple[str, dict]] = None,\n", + " **kwargs: Optional[Any]\n", + " ):\n", + " \"\"\"\n", + " Initialize ReadTheDocsLoader\n", + "\n", + " The loader loops over all files under `path` and extracts the actual content of\n", + " the files by retrieving main html tags. Default main html tags include\n", + " `
<main id=\"main-content\">`, `<div role=\"main\">`, and `<article role=\"main\">
`. You\n", + " can also define your own html tags by passing custom_html_tag, e.g.\n", + " `(\"div\", \"class=main\")`. The loader iterates html tags with the order of\n", + " custom html tags (if exists) and default html tags. If any of the tags is not\n", + " empty, the loop will break and retrieve the content out of that tag.\n", + "\n", + " Args:\n", + " path: The location of pulled readthedocs folder.\n", + " encoding: The encoding with which to open the documents.\n", + " errors: Specifies how encoding and decoding errors are to be handled—this\n", + " cannot be used in binary mode.\n", + " custom_html_tag: Optional custom html tag to retrieve the content from\n", + " files.\n", + " \"\"\"\n", + " print(\"new loader\")\n", + " try:\n", + " from bs4 import BeautifulSoup\n", + " except ImportError:\n", + " raise ImportError(\n", + " \"Could not import python packages. \"\n", + " \"Please install it with `pip install beautifulsoup4`. \"\n", + " )\n", + "\n", + " try:\n", + " _ = BeautifulSoup(\n", + " \"Parser builder library test.\", **kwargs\n", + " )\n", + " except Exception as e:\n", + " raise ValueError(\"Parsing kwargs do not appear valid\") from e\n", + "\n", + " self.file_path = Path(path)\n", + " self.encoding = encoding\n", + " self.errors = errors\n", + " self.custom_html_tag = custom_html_tag\n", + " self.bs_kwargs = kwargs\n", + "\n", + " def load(self) -> List[Document]:\n", + " \"\"\"Load documents.\"\"\"\n", + " docs = []\n", + " for p in self.file_path.rglob(\"*\"):\n", + " if p.is_dir():\n", + " continue\n", + " with open(p, encoding=self.encoding, errors=self.errors) as f:\n", + " text = self._clean_data(f.read())\n", + " metadata = {\"source\": str(p)}\n", + " docs.append(Document(page_content=text, metadata=metadata))\n", + " return docs\n", + "\n", + " def _clean_data(self, data: str) -> str:\n", + " from bs4 import BeautifulSoup\n", + "\n", + " soup = BeautifulSoup(data, **self.bs_kwargs)\n", + "\n", + " # default tags\n", + " html_tags = 
[\n", + " (\"div\", {\"role\": \"main\"}),\n", + " (\"main\", {\"id\": \"main-content\"}),\n", + " ]\n", + " for para in soup.find_all(\"p\"):\n", + " print(para.get_text())\n", + "\n", + "\n", + " if self.custom_html_tag is not None:\n", + " html_tags.append(self.custom_html_tag)\n", + "\n", + " text = \"\"\n", + " for para in soup.find_all(\"p\"):\n", + " if para.get_text() is not None:\n", + " text = text + para.get_text()\n", + "\n", + " # reversed order. check the custom one first\n", + " # for tag, attrs in html_tags[::-1]:\n", + " # text = soup.find(tag, attrs)\n", + " # # if found, break\n", + " # if text is not None:\n", + " # break\n", + "\n", + " # if text is not None:\n", + " # text = text.get_text()\n", + " # else:\n", + " # text = \"\"\n", + " # trim empty lines\n", + " return \"\\n\".join([t for t in text.split(\"\\n\") if t])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6Qbi_HW_gYL_" + }, + "source": [ + "### Other" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WdqLl8A4WrKy" + }, + "outputs": [], + "source": [ + "\n", + "# Fixie https://docs.fixie.ai/ y\n", + "## LangChain https://python.langchain.com/en/latest/ y\n", + "## GPT index https://gpt-index.readthedocs.io/en/latest/ y\n", + "## GPTCache https://gptcache.readthedocs.io/en/latest/ y\n", + "## GPT4ALL https://docs.gpt4all.io/ y\n", + "\n", + "# Auto GPT https://docs.agpt.co/ y\n", + "# db-GPT https://db-gpt.readthedocs.io/en/latest/ y\n", + "# # AgentGPT https://docs.reworkd.ai/ ???\n", + "\n", + "## YOLOX https://yolox.readthedocs.io/\n", + "## https://textattack.readthedocs.io/en/master/\n", + "## https://cowrie.readthedocs.io\n", + "## https://pixellib.readthedocs.io/en/latest/\n", + "\n", + "## https://django-storages.readthedocs.io/en/latest/\n", + "## https://msal-python.readthedocs.io/en/latest/" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "c7GrYY2xz1eH", + "outputId": "fe291d1d-a762-4f60-a3c4-20d0925ec9b2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-06-13 09:30:54-- http://auto/\n", + "Resolving auto (auto)... failed: No address associated with hostname.\n", + "wget: unable to resolve host address ‘auto’\n", + "--2023-06-13 09:30:54-- http://gpt/\n", + "Resolving gpt (gpt)... failed: Name or service not known.\n", + "wget: unable to resolve host address ‘gpt’\n", + "--2023-06-13 09:30:54-- https://docs.agpt.co/\n", + "Resolving docs.agpt.co (docs.agpt.co)... 34.148.97.127, 34.74.170.74, 2600:1f18:2489:8202::c8, ...\n", + "Connecting to docs.agpt.co (docs.agpt.co)|34.148.97.127|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 9391 (9.2K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/index.html’\n", + "\n", + "\rdocs.agpt.co/index. 0%[ ] 0 --.-KB/s \rdocs.agpt.co/index. 100%[===================>] 9.17K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:55 (87.5 MB/s) - ‘rtdocs/docs.agpt.co/index.html’ saved [9391/9391]\n", + "\n", + "Loading robots.txt; please ignore errors.\n", + "--2023-06-13 09:30:55-- https://docs.agpt.co/robots.txt\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 404 Not Found\n", + "2023-06-13 09:30:55 ERROR 404: Not Found.\n", + "\n", + "--2023-06-13 09:30:55-- https://docs.agpt.co/setup/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 22400 (22K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/setup/index.html’\n", + "\n", + "docs.agpt.co/setup/ 100%[===================>] 21.88K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:55 (112 MB/s) - ‘rtdocs/docs.agpt.co/setup/index.html’ saved [22400/22400]\n", + "\n", + "--2023-06-13 09:30:55-- https://docs.agpt.co/usage/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 14226 (14K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/usage/index.html’\n", + "\n", + "docs.agpt.co/usage/ 100%[===================>] 13.89K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:55 (126 MB/s) - ‘rtdocs/docs.agpt.co/usage/index.html’ saved [14226/14226]\n", + "\n", + "--2023-06-13 09:30:55-- https://docs.agpt.co/plugins/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 9534 (9.3K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/plugins/index.html’\n", + "\n", + "docs.agpt.co/plugin 100%[===================>] 9.31K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:55 (143 MB/s) - ‘rtdocs/docs.agpt.co/plugins/index.html’ saved [9534/9534]\n", + "\n", + "--2023-06-13 09:30:55-- https://docs.agpt.co/configuration/search/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 11365 (11K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/configuration/search/index.html’\n", + "\n", + "docs.agpt.co/config 100%[===================>] 11.10K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:55 (158 MB/s) - ‘rtdocs/docs.agpt.co/configuration/search/index.html’ saved [11365/11365]\n", + "\n", + "--2023-06-13 09:30:55-- https://docs.agpt.co/configuration/memory/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 20678 (20K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/configuration/memory/index.html’\n", + "\n", + "docs.agpt.co/config 100%[===================>] 20.19K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:55 (125 MB/s) - ‘rtdocs/docs.agpt.co/configuration/memory/index.html’ saved [20678/20678]\n", + "\n", + "--2023-06-13 09:30:55-- https://docs.agpt.co/configuration/voice/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 10862 (11K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/configuration/voice/index.html’\n", + "\n", + "docs.agpt.co/config 100%[===================>] 10.61K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:55 (87.3 MB/s) - ‘rtdocs/docs.agpt.co/configuration/voice/index.html’ saved [10862/10862]\n", + "\n", + "--2023-06-13 09:30:55-- https://docs.agpt.co/configuration/imagegen/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 12253 (12K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/configuration/imagegen/index.html’\n", + "\n", + "docs.agpt.co/config 100%[===================>] 11.97K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:55 (115 MB/s) - ‘rtdocs/docs.agpt.co/configuration/imagegen/index.html’ saved [12253/12253]\n", + "\n", + "--2023-06-13 09:30:55-- https://docs.agpt.co/contributing/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 9958 (9.7K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/contributing/index.html’\n", + "\n", + "docs.agpt.co/contri 100%[===================>] 9.72K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:55 (103 MB/s) - ‘rtdocs/docs.agpt.co/contributing/index.html’ saved [9958/9958]\n", + "\n", + "--2023-06-13 09:30:55-- https://docs.agpt.co/testing/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 10645 (10K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/testing/index.html’\n", + "\n", + "docs.agpt.co/testin 100%[===================>] 10.40K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:55 (159 MB/s) - ‘rtdocs/docs.agpt.co/testing/index.html’ saved [10645/10645]\n", + "\n", + "--2023-06-13 09:30:55-- https://docs.agpt.co/code-of-conduct/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 11808 (12K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/code-of-conduct/index.html’\n", + "\n", + "docs.agpt.co/code-o 100%[===================>] 11.53K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:55 (223 MB/s) - ‘rtdocs/docs.agpt.co/code-of-conduct/index.html’ saved [11808/11808]\n", + "\n", + "--2023-06-13 09:30:55-- https://docs.agpt.co/challenges/introduction/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 11365 (11K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/challenges/introduction/index.html’\n", + "\n", + "docs.agpt.co/challe 100%[===================>] 11.10K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:55 (89.4 MB/s) - ‘rtdocs/docs.agpt.co/challenges/introduction/index.html’ saved [11365/11365]\n", + "\n", + "--2023-06-13 09:30:55-- https://docs.agpt.co/challenges/memory/introduction/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 9793 (9.6K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/challenges/memory/introduction/index.html’\n", + "\n", + "docs.agpt.co/challe 100%[===================>] 9.56K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:56 (202 MB/s) - ‘rtdocs/docs.agpt.co/challenges/memory/introduction/index.html’ saved [9793/9793]\n", + "\n", + "--2023-06-13 09:30:56-- https://docs.agpt.co/challenges/memory/challenge_a/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 10747 (10K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/challenges/memory/challenge_a/index.html’\n", + "\n", + "docs.agpt.co/challe 100%[===================>] 10.50K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:56 (222 MB/s) - ‘rtdocs/docs.agpt.co/challenges/memory/challenge_a/index.html’ saved [10747/10747]\n", + "\n", + "--2023-06-13 09:30:56-- https://docs.agpt.co/challenges/memory/challenge_b/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 11162 (11K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/challenges/memory/challenge_b/index.html’\n", + "\n", + "docs.agpt.co/challe 100%[===================>] 10.90K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:56 (254 MB/s) - ‘rtdocs/docs.agpt.co/challenges/memory/challenge_b/index.html’ saved [11162/11162]\n", + "\n", + "--2023-06-13 09:30:56-- https://docs.agpt.co/challenges/memory/challenge_c/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 11458 (11K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/challenges/memory/challenge_c/index.html’\n", + "\n", + "docs.agpt.co/challe 100%[===================>] 11.19K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:56 (193 MB/s) - ‘rtdocs/docs.agpt.co/challenges/memory/challenge_c/index.html’ saved [11458/11458]\n", + "\n", + "--2023-06-13 09:30:56-- https://docs.agpt.co/challenges/memory/challenge_d/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 13506 (13K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/challenges/memory/challenge_d/index.html’\n", + "\n", + "docs.agpt.co/challe 100%[===================>] 13.19K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:56 (88.7 MB/s) - ‘rtdocs/docs.agpt.co/challenges/memory/challenge_d/index.html’ saved [13506/13506]\n", + "\n", + "--2023-06-13 09:30:56-- https://docs.agpt.co/challenges/information_retrieval/introduction/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 9694 (9.5K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/challenges/information_retrieval/introduction/index.html’\n", + "\n", + "docs.agpt.co/challe 100%[===================>] 9.47K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:56 (161 MB/s) - ‘rtdocs/docs.agpt.co/challenges/information_retrieval/introduction/index.html’ saved [9694/9694]\n", + "\n", + "--2023-06-13 09:30:56-- https://docs.agpt.co/challenges/information_retrieval/challenge_a/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 10499 (10K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/challenges/information_retrieval/challenge_a/index.html’\n", + "\n", + "docs.agpt.co/challe 100%[===================>] 10.25K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:56 (116 MB/s) - ‘rtdocs/docs.agpt.co/challenges/information_retrieval/challenge_a/index.html’ saved [10499/10499]\n", + "\n", + "--2023-06-13 09:30:56-- https://docs.agpt.co/challenges/information_retrieval/challenge_b/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 10560 (10K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/challenges/information_retrieval/challenge_b/index.html’\n", + "\n", + "docs.agpt.co/challe 100%[===================>] 10.31K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:56 (151 MB/s) - ‘rtdocs/docs.agpt.co/challenges/information_retrieval/challenge_b/index.html’ saved [10560/10560]\n", + "\n", + "--2023-06-13 09:30:56-- https://docs.agpt.co/challenges/submit/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 10283 (10K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/challenges/submit/index.html’\n", + "\n", + "docs.agpt.co/challe 100%[===================>] 10.04K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:56 (242 MB/s) - ‘rtdocs/docs.agpt.co/challenges/submit/index.html’ saved [10283/10283]\n", + "\n", + "--2023-06-13 09:30:56-- https://docs.agpt.co/challenges/beat/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 9570 (9.3K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/challenges/beat/index.html’\n", + "\n", + "docs.agpt.co/challe 100%[===================>] 9.35K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:56 (244 MB/s) - ‘rtdocs/docs.agpt.co/challenges/beat/index.html’ saved [9570/9570]\n", + "\n", + "--2023-06-13 09:30:56-- https://docs.agpt.co/challenges/list/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 8759 (8.6K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/challenges/list/index.html’\n", + "\n", + "docs.agpt.co/challe 100%[===================>] 8.55K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:56 (195 MB/s) - ‘rtdocs/docs.agpt.co/challenges/list/index.html’ saved [8759/8759]\n", + "\n", + "--2023-06-13 09:30:56-- https://docs.agpt.co/challenges/challenge_template/\n", + "Reusing existing connection to docs.agpt.co:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 9298 (9.1K) [text/html]\n", + "Saving to: ‘rtdocs/docs.agpt.co/challenges/challenge_template/index.html’\n", + "\n", + "docs.agpt.co/challe 100%[===================>] 9.08K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:56 (210 MB/s) - ‘rtdocs/docs.agpt.co/challenges/challenge_template/index.html’ saved [9298/9298]\n", + "\n", + "FINISHED --2023-06-13 09:30:56--\n", + "Total wall clock time: 1.9s\n", + "Downloaded: 24 files, 273K in 0.002s (137 MB/s)\n", + "--2023-06-13 09:30:56-- http://db-gpt/\n", + "Resolving db-gpt (db-gpt)... failed: Name or service not known.\n", + "wget: unable to resolve host address ‘db-gpt’\n", + "--2023-06-13 09:30:56-- https://db-gpt.readthedocs.io/en/latest/\n", + "Resolving db-gpt.readthedocs.io (db-gpt.readthedocs.io)... 104.17.32.82, 104.17.33.82, 2606:4700::6811:2052, ...\n", + "Connecting to db-gpt.readthedocs.io (db-gpt.readthedocs.io)|104.17.32.82|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/index.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 22.75K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:57 (73.3 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/index.html’ saved [23297]\n", + "\n", + "Loading robots.txt; please ignore errors.\n", + "--2023-06-13 09:30:57-- https://db-gpt.readthedocs.io/robots.txt\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 96 [text/plain]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/robots.txt.tmp’\n", + "\n", + "db-gpt.readthedocs. 100%[===================>] 96 --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:57 (8.21 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/robots.txt.tmp’ saved [96/96]\n", + "\n", + "--2023-06-13 09:30:57-- https://db-gpt.readthedocs.io/en/latest/index.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/index.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 22.75K --.-KB/s in 0.001s \n", + "\n", + "2023-06-13 09:30:57 (15.2 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/index.html’ saved [23297]\n", + "\n", + "--2023-06-13 09:30:57-- https://db-gpt.readthedocs.io/en/latest/genindex.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/genindex.html’\n", + "\n", + "db-gpt.readthedocs. 
[ <=> ] 12.36K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:58 (41.2 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/genindex.html’ saved [12656]\n", + "\n", + "--2023-06-13 09:30:58-- https://db-gpt.readthedocs.io/en/latest/search.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/search.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 13.13K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:58 (55.5 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/search.html’ saved [13446]\n", + "\n", + "--2023-06-13 09:30:58-- https://db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 22.35K --.-KB/s in 0.001s \n", + "\n", + "2023-06-13 09:30:58 (27.2 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/getting_started/getting_started.html’ saved [22891]\n", + "\n", + "--2023-06-13 09:30:58-- https://db-gpt.readthedocs.io/en/latest/getting_started/concepts.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/getting_started/concepts.html’\n", + "\n", + "db-gpt.readthedocs. 
[ <=> ] 14.37K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:59 (72.1 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/getting_started/concepts.html’ saved [14716]\n", + "\n", + "--2023-06-13 09:30:59-- https://db-gpt.readthedocs.io/en/latest/getting_started/tutorials.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/getting_started/tutorials.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 14.46K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:30:59 (62.8 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/getting_started/tutorials.html’ saved [14802]\n", + "\n", + "--2023-06-13 09:30:59-- https://db-gpt.readthedocs.io/en/latest/modules/llms.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/modules/llms.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 16.49K --.-KB/s in 0.001s \n", + "\n", + "2023-06-13 09:30:59 (26.9 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/modules/llms.html’ saved [16889]\n", + "\n", + "--2023-06-13 09:30:59-- https://db-gpt.readthedocs.io/en/latest/modules/prompts.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/modules/prompts.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 14.67K --.-KB/s in 0.001s \n", + "\n", + "2023-06-13 09:31:00 (26.2 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/modules/prompts.html’ saved [15022]\n", + "\n", + "--2023-06-13 09:31:00-- https://db-gpt.readthedocs.io/en/latest/modules/plugins.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/modules/plugins.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 14.72K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:31:00 (131 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/modules/plugins.html’ saved [15074]\n", + "\n", + "--2023-06-13 09:31:00-- https://db-gpt.readthedocs.io/en/latest/modules/connections.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/modules/connections.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 14.59K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:31:00 (82.8 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/modules/connections.html’ saved [14940]\n", + "\n", + "--2023-06-13 09:31:00-- https://db-gpt.readthedocs.io/en/latest/modules/knownledge.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/modules/knownledge.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 18.24K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:31:00 (56.9 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/modules/knownledge.html’ saved [18680]\n", + "\n", + "--2023-06-13 09:31:00-- https://db-gpt.readthedocs.io/en/latest/use_cases/sql_generation_and_diagnosis.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/use_cases/sql_generation_and_diagnosis.html’\n", + "\n", + "db-gpt.readthedocs. 
[ <=> ] 14.52K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:31:01 (321 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/use_cases/sql_generation_and_diagnosis.html’ saved [14871]\n", + "\n", + "--2023-06-13 09:31:01-- https://db-gpt.readthedocs.io/en/latest/use_cases/knownledge_based_qa.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/use_cases/knownledge_based_qa.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 18.40K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:31:01 (59.7 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/use_cases/knownledge_based_qa.html’ saved [18846]\n", + "\n", + "--2023-06-13 09:31:01-- https://db-gpt.readthedocs.io/en/latest/use_cases/chatbots.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/use_cases/chatbots.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 14.38K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:31:01 (178 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/use_cases/chatbots.html’ saved [14720]\n", + "\n", + "--2023-06-13 09:31:01-- https://db-gpt.readthedocs.io/en/latest/use_cases/query_database_data.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/use_cases/query_database_data.html’\n", + "\n", + "db-gpt.readthedocs. 
[ <=> ] 14.42K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:31:01 (160 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/use_cases/query_database_data.html’ saved [14771]\n", + "\n", + "--2023-06-13 09:31:01-- https://db-gpt.readthedocs.io/en/latest/use_cases/interacting_with_api.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/use_cases/interacting_with_api.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 14.48K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:31:02 (282 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/use_cases/interacting_with_api.html’ saved [14825]\n", + "\n", + "--2023-06-13 09:31:02-- https://db-gpt.readthedocs.io/en/latest/use_cases/tool_use_with_plugin.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/use_cases/tool_use_with_plugin.html’\n", + "\n", + "db-gpt.readthedocs. [ <=> ] 14.44K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:31:02 (287 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/use_cases/tool_use_with_plugin.html’ saved [14791]\n", + "\n", + "--2023-06-13 09:31:02-- https://db-gpt.readthedocs.io/en/latest/reference.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: unspecified [text/html]\n", + "Saving to: ‘rtdocs/db-gpt.readthedocs.io/en/latest/reference.html’\n", + "\n", + "db-gpt.readthedocs. 
[ <=> ] 13.94K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:31:02 (214 MB/s) - ‘rtdocs/db-gpt.readthedocs.io/en/latest/reference.html’ saved [14272]\n", + "\n", + "--2023-06-13 09:31:02-- https://db-gpt.readthedocs.io/en/latest/.getting_started/tutorials.html\n", + "Reusing existing connection to db-gpt.readthedocs.io:443.\n", + "HTTP request sent, awaiting response... 404 Not Found\n", + "2023-06-13 09:31:03 ERROR 404: Not Found.\n", + "\n", + "FINISHED --2023-06-13 09:31:03--\n", + "Total wall clock time: 6.2s\n", + "Downloaded: 20 files, 306K in 0.006s (50.3 MB/s)\n", + "--2023-06-13 09:31:03-- http://agentgpt/\n", + "Resolving agentgpt (agentgpt)... failed: Name or service not known.\n", + "wget: unable to resolve host address ‘agentgpt’\n", + "--2023-06-13 09:31:03-- https://docs.reworkd.ai/\n", + "Resolving docs.reworkd.ai (docs.reworkd.ai)... 76.76.21.21\n", + "Connecting to docs.reworkd.ai (docs.reworkd.ai)|76.76.21.21|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 14863 (15K) [text/html]\n", + "Saving to: ‘rtdocs/docs.reworkd.ai/index.html’\n", + "\n", + "docs.reworkd.ai/ind 100%[===================>] 14.51K --.-KB/s in 0s \n", + "\n", + "2023-06-13 09:31:03 (50.0 MB/s) - ‘rtdocs/docs.reworkd.ai/index.html’ saved [14863/14863]\n", + "\n", + "FINISHED --2023-06-13 09:31:03--\n", + "Total wall clock time: 0.3s\n", + "Downloaded: 1 files, 15K in 0s (50.0 MB/s)\n" + ] + } + ], + "source": [ + "# !wget -r -A.html -P rtdocs Fixie https://docs.fixie.ai/ # Y\n", + "# !wget -r -A.html -P rtdocs LangChain https://python.langchain.com/en/latest/ # y\n", + "# !wget -r -A.html -P rtdocs GPT index https://gpt-index.readthedocs.io/en/latest/ # y custom\n", + "# !wget -r -A.html -P rtdocs GPTCache https://gptcache.readthedocs.io/en/latest/ # y\n", + "# !wget -r -A.html -P rtdocs GPT4ALL https://docs.gpt4all.io/ # y custom\n", + "\n", + "!wget -r -A.html -P rtdocs Auto GPT https://docs.agpt.co/ # y\n", + "!wget -r -A.html -P rtdocs db-GPT https://db-gpt.readthedocs.io/en/latest/ # y custom\n", + "!wget -r -A.html -P rtdocs AgentGPT https://docs.reworkd.ai/ #" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EFxDxeEfD3a9", + "outputId": "47c478ec-1df2-4791-8f04-0de06d770eb0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "new loader\n" + ] + } + ], + "source": [ + "custom = True\n", + "file_path = \"/content/rtdocs/docs.reworkd.ai\"\n", + "if custom:\n", + " loader = ReadTheDocsLoader_custom(file_path, features='html.parser')\n", + "else:\n", + " loader = ReadTheDocsLoader(file_path, features='html.parser')" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dyZ4gQH9HMu3", + "outputId": "c3728716-cef4-4440-c2e4-1e087ee11b2c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "Welcome to AgentGPT 👋, an autonomous AI Agent platform that empowers users to create and deploy customizable autonomous AI agents directly in the web.\n", + "Simply assign a name and goal to your AI agent, and watch as it embarks on an exciting journey to accomplish the assigned objective.\n", + "AgentGPT works by chaining language models (Agents) to perform a given goal.\n", + "Recursively, an agent will think of the best tasks to perform a goal, execute on those tasks, evaluate how it performed, and continually think of more tasks.\n", + "We think the power of AI should be available to everyone and should be driven by community.\n", + "This is why we are proudly open source. We'd love to hear your feedback at every step of the journey and the following channels are open for you to reach the team:\n", + "The entire AgentGPT team is incredibly excited for the road to follow.\n", + "We have a ton of exciting features planned for the future, and we hope you'll join us on this journey.\n", + "Have a glimpse of what's to come in the roadmap on the next page.\n" + ] + } + ], + "source": [ + "documents = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DLkGn2tHTzZn", + "outputId": "325cbf32-9ed9-42b0-85cf-f9dee59577c7" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(lc_kwargs={'page_content': \"Welcome to AgentGPT 👋, an autonomous AI Agent platform that empowers users to create and deploy customizable autonomous AI agents directly in the web.\\nSimply assign a name and goal to your AI agent, and watch as it embarks on an exciting journey to accomplish the assigned objective.AgentGPT works by chaining language models (Agents) to perform a given goal.\\nRecursively, an agent will think of the best tasks to perform a goal, execute on those tasks, evaluate how it performed, and continually think of more tasks.We think the power of AI 
should be available to everyone and should be driven by community.\\nThis is why we are proudly open source. We'd love to hear your feedback at every step of the journey and the following channels are open for you to reach the team:The entire AgentGPT team is incredibly excited for the road to follow.\\nWe have a ton of exciting features planned for the future, and we hope you'll join us on this journey.\\nHave a glimpse of what's to come in the roadmap on the next page.\", 'metadata': {'source': '/content/rtdocs/docs.reworkd.ai/index.html'}}, page_content=\"Welcome to AgentGPT 👋, an autonomous AI Agent platform that empowers users to create and deploy customizable autonomous AI agents directly in the web.\\nSimply assign a name and goal to your AI agent, and watch as it embarks on an exciting journey to accomplish the assigned objective.AgentGPT works by chaining language models (Agents) to perform a given goal.\\nRecursively, an agent will think of the best tasks to perform a goal, execute on those tasks, evaluate how it performed, and continually think of more tasks.We think the power of AI should be available to everyone and should be driven by community.\\nThis is why we are proudly open source. 
We'd love to hear your feedback at every step of the journey and the following channels are open for you to reach the team:The entire AgentGPT team is incredibly excited for the road to follow.\\nWe have a ton of exciting features planned for the future, and we hope you'll join us on this journey.\\nHave a glimpse of what's to come in the roadmap on the next page.\", metadata={'source': '/content/rtdocs/docs.reworkd.ai/index.html'})" + ] + }, + "execution_count": 157, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "metadata": { + "id": "26YQFYzYyqC4" + }, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import FAISS\n", + "from langchain.document_loaders import TextLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": { + "id": "OJf6GU0Eyv3R" + }, + "outputs": [], + "source": [ + "\n", + "text_splitter = CharacterTextSplitter(separator =\"\\n\", chunk_size=2000, chunk_overlap=100) # need to modify separator for different documents\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FHoyqckZ2C5R" + }, + "outputs": [], + "source": [ + "docs[2]" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ojgaHZJ-zLZA", + "outputId": "cdb46bd2-72b5-4ab8-ffce-8a599e9368a4" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 160, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for doc in docs:\n", + " if len(doc.page_content) == 0:\n", + " docs.remove(doc)\n", + "\n", + "len(docs)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 149, + "metadata": { + "id": "qkszARy4zyCo" + }, + "outputs": [], + "source": [ + "texts = []\n", + "meta = []\n", + "\n", + "for doc in docs:\n", + " texts.append(doc.page_content)\n", + "\n", + " metadata = {}\n", + " source = doc.metadata['source'].replace(\"/content/rtdocs/\",\"\")\n", + " metadata[\"source\"] = source\n", + " meta.append(metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CphmhDV62rS8", + "outputId": "96ab8cfa-e380-4e56-c30b-720a6e864654" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'source': 'python.langchain.com/en/latest/search.html'}" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BTLdeq9i0Rtf" + }, + "source": [ + "# File" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "id": "kkxENgJe0TZn" + }, + "outputs": [], + "source": [ + "from langchain.embeddings import OpenAIEmbeddings\n", + "from langchain.embeddings.base import Embeddings\n", + "from langchain.vectorstores.base import VectorStore\n", + "import pinecone\n", + "import os\n", + "import uuid\n", + "from typing import Any, Iterable, List, Optional, Type\n", + "from langchain.docstore.document import Document\n", + "import logging\n", + "from langchain.vectorstores.base import VectorStoreRetriever\n", + "\n", + "PINECONE_API_KEY = os.environ.get(\"PINECONE_API_KEY\")\n", + "PINECONE_ENV = os.environ.get(\"PINECONE_ENV\")\n", + "\n", + "\n", + "class ConversationStore(VectorStore):\n", + " def __init__(self, namespace, db, embeddings, text_key):\n", + " self.db = db\n", + " self.namespace = namespace\n", + " self.embeddings = embeddings\n", + " self.text_key = text_key\n", + "\n", + " @classmethod\n", + " def from_index(cls, namespace, openai_api_key, index_name, 
text_key=\"text\"):\n", + " print(f\"Loading index {index_name} from Pinecone\")\n", + " pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)\n", + " logging.info(f\"Loading index {index_name} from Pinecone\")\n", + " index = pinecone.Index(index_name)\n", + " embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)\n", + " return cls(namespace, index, embeddings, text_key)\n", + "\n", + " def add_text(self, text: str, metadata={}) -> str:\n", + " source = metadata[\"source\"]\n", + " metadata = {self.text_key: text, \"source\": source}\n", + " id = str(uuid.uuid4())\n", + " embedding = self.embeddings.embed_query(text)\n", + " doc = {\"id\": id, \"values\": embedding, \"metadata\": metadata}\n", + "\n", + " self.db.upsert(vectors=[doc], namespace=self.namespace)\n", + "\n", + " return id\n", + "\n", + " def add_texts(self, texts: Iterable[str], metadatas: List[dict]) -> List[str]:\n", + " for text, metadata in zip(texts, metadatas):\n", + " self.add_text(text, metadata)\n", + "\n", + " def similarity_search(\n", + " self,\n", + " text: str,\n", + " top_k: int = 5,\n", + " filter: Optional[dict] = None,\n", + " threshold: float = 0.7,\n", + " ) -> list[Document]:\n", + " query_embedding = self.embeddings.embed_query(text)\n", + " results = self.db.query(\n", + " [query_embedding],\n", + " top_k=top_k,\n", + " include_metadata=True,\n", + " namespace=self.namespace,\n", + " filter=filter,\n", + " )\n", + "\n", + " # print(results)\n", + " docs = []\n", + " for res in results[\"matches\"]:\n", + " metadata = res[\"metadata\"]\n", + " text = metadata.pop(self.text_key)\n", + " if res[\"score\"] > threshold:\n", + " docs.append(Document(page_content=text, metadata=metadata))\n", + " return docs\n", + "\n", + " @classmethod\n", + " def delete(cls, namespace, index_name):\n", + " pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)\n", + " index = pinecone.Index(index_name)\n", + " return index.delete(delete_all=True, 
namespace=namespace)\n", + "\n", + " @classmethod\n", + " def get_vector_retrieval(\n", + " cls, namespace: str, openai_api_key: str, index_name: str\n", + " ) -> VectorStoreRetriever:\n", + " vectorstore = cls.from_index(namespace, openai_api_key, index_name)\n", + " retriever = VectorStoreRetriever(vectorstore=vectorstore)\n", + " return retriever\n", + "\n", + " @classmethod\n", + " def from_texts(cls, texts: List[str], embedding: Embeddings, metadatas: list[dict]):\n", + " raise NotImplementedError(\"ConversationStore does not support from_texts\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s13JzbhM0WZH" + }, + "source": [ + "# Upload docs" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "id": "Oo0d4TTq4Ljf" + }, + "outputs": [], + "source": [ + "PINECONE_INDEX=\"langchain\"\n", + "PINECONE_API_KEY=\"5b33d948-9710-42f7-8de0-1bf35a6bf54c\"\n", + "PINECONE_ENV=\"northamerica-northeast1-gcp\"" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "xbJAiBEx4zH3", + "outputId": "a0bf4628-630e-4c57-a684-0760524aeee4" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'5b33d948-9710-42f7-8de0-1bf35a6bf54c'" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "PINECONE_API_KEY" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "ia2ENUiR40EF", + "outputId": "adb65f88-c1c5-4d4a-a3c1-af1c0b9c6628" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'northamerica-northeast1-gcp'" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "PINECONE_ENV" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "id": "ySnO4vqk4AT-" + }, + "outputs": [], + "source": [ + "pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)\n", + "index = pinecone.Index(\"langchain\")\n", + "embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)\n", + "\n", + "vectorstore = ConversationStore(\"ReadTheDocs\", index, embeddings, 'text')" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iZEI-bEY7Nu3", + "outputId": "3aee1959-94e1-4bec-aee9-614c33eca784" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{}" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# index.delete(delete_all=True, namespace=\"ReadTheDocs\")" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WCEwc9tL3zxN", + "outputId": "6efb0cde-4176-4529-aafc-226507e2da95" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'dimension': 1536,\n", + " 'index_fullness': 0.0,\n", + " 'namespaces': {'01662450-28d4-49b7-8484-5f27f34a2cdc': {'vector_count': 24},\n", + " '023e8e97-eea9-4538-9fd7-afad634087df': {'vector_count': 3},\n", + " '06596c06-638b-4dd2-954f-acd5e21c8231': {'vector_count': 1},\n", + " '06e6ba9b-a273-4148-99c0-1613b5f9f239': {'vector_count': 2},\n", + " '0fc4f143-60b0-44b3-9865-1e85db59299c': {'vector_count': 1},\n", + " '11873f72-c580-4fea-83fe-e4bbe72f0410': {'vector_count': 1},\n", + " '1204cb97-7bf6-4617-8897-8355fa8da366': {'vector_count': 1},\n", + " '16f0978b-32e5-4115-90e2-09690d718331': {'vector_count': 92},\n", + " '175a95ed-6317-40e6-88ed-095f4d5dc03c': {'vector_count': 6},\n", + " '195bff2d-2aa4-40e3-9a03-443347da8f61': {'vector_count': 7},\n", + " '19cd51a5-e33c-4f5f-bca9-3bca31391006': {'vector_count': 30},\n", + " 
'1cf26fb0-89f1-4db8-a26a-45fe1e29244a': {'vector_count': 18},\n", + " '2538d125-438c-400b-b717-e9c764047f09': {'vector_count': 4},\n", + " '29e6e154-2231-4cc9-87d4-1c4be09884c1': {'vector_count': 1},\n", + " '2c7009c2-419e-40d1-8aab-ca7270a0db6e-0-0': {'vector_count': 3},\n", + " '2f77a1a8-a5ba-4d86-be99-62caf445c1ec': {'vector_count': 2},\n", + " '345e0fd8-13cf-4a36-beb8-2e48a531b18d-0-0': {'vector_count': 3},\n", + " '34be3597-a76f-4165-a580-f5b7a537a7f1-0-0': {'vector_count': 2},\n", + " '34be3597-a76f-4165-a580-f5b7a537a7f1-1-0': {'vector_count': 1},\n", + " '35392789-c88b-485a-b80b-1f6dbf0678a3': {'vector_count': 1},\n", + " '3841e98c-8948-4c78-9adf-460b67078d61': {'vector_count': 7},\n", + " '386f8f29-f454-41f3-83d7-08dd4051e5ea': {'vector_count': 16},\n", + " '3cb4ca65-65d0-482f-a8f9-6f517bdf2ff2': {'vector_count': 2},\n", + " '3dd891ae-4466-4a78-8194-77d4e2c6fad9': {'vector_count': 1},\n", + " '3feb2f47-4f11-4d48-b836-731494b03b34': {'vector_count': 1},\n", + " '41953c44-7c42-4396-8989-f446236c1754': {'vector_count': 1},\n", + " '45145592-3f61-4f4d-adbc-1964267f7b01': {'vector_count': 1},\n", + " '463cb62c-3c9f-4c5f-afe5-1a1f6714cd1e': {'vector_count': 1},\n", + " '4640102d-71ad-4ee2-aa6b-6e0186bb2ca8': {'vector_count': 1},\n", + " '46b01f94-a33a-47c7-bc6d-4a67ad3bdbe3': {'vector_count': 55},\n", + " '49a0ae07-6c52-4b53-9ad2-27834a8b2ad8': {'vector_count': 2},\n", + " '4ace051e-a06f-41ac-8347-a0219ff13620': {'vector_count': 15},\n", + " '4b30804e-ec4e-4660-acbc-dcd1aad852d2': {'vector_count': 2},\n", + " '4f2259d0-4978-46b5-87b9-01494dbf52e6': {'vector_count': 1},\n", + " '5038f61b-94fe-4d57-8893-b940cf081d1c': {'vector_count': 1},\n", + " '51877c86-1097-4ab5-b96d-d9908f9b8c79': {'vector_count': 3},\n", + " '59418804-bd27-49d2-afd9-85232b275372': {'vector_count': 6},\n", + " '5bfca4c0-3751-4c23-a0bb-71cd3b80bba8': {'vector_count': 15},\n", + " '5d69c525-89e2-48b8-86a6-ef8489beb465-0-0': {'vector_count': 4},\n", + " 
'5d69c525-89e2-48b8-86a6-ef8489beb465-1-0': {'vector_count': 2},\n", + " '5e7d6969-108c-400c-8e3a-3607e666df70': {'vector_count': 7},\n", + " '5fc58d0e-a1eb-44fd-bc21-66f25313dd98': {'vector_count': 1},\n", + " '60a18b0b-cfc7-4b93-b62e-47e1e62027d1': {'vector_count': 2},\n", + " '60bfe64e-2dd8-40e4-bd7d-1e033dc3db18': {'vector_count': 10},\n", + " '62d6c8e2-babc-4f2c-9f01-76ee5e513b27': {'vector_count': 30},\n", + " '6441f86c-274c-485f-9849-2f0e30022eec': {'vector_count': 6},\n", + " '64f41368-91f0-482c-8b70-e979cbf03426': {'vector_count': 1},\n", + " '6581bd08-8768-4bc3-8dec-66f9a9a6c81f': {'vector_count': 20},\n", + " '65da3b01-0956-4298-988e-bf0da26f367a': {'vector_count': 3},\n", + " '67c5b8ae-8c48-4f18-9267-cbafec896cfd': {'vector_count': 6},\n", + " '68ba8730-914a-44b6-9a02-9a7a0d5ee3d9': {'vector_count': 6},\n", + " '74164868-8548-43fb-9f94-515c13f3103a': {'vector_count': 2},\n", + " '7561a19f-1c90-45a0-93af-2a5e57172419': {'vector_count': 2},\n", + " '77813683-3099-44e1-bfe9-7bfaedadbaf1': {'vector_count': 38},\n", + " '7d25d04c-1c0b-4869-94fb-3c1699b4a9c1-0-0': {'vector_count': 7},\n", + " '7e4b3ad0-f746-4366-b6ce-5aa45702275b': {'vector_count': 1},\n", + " '7fed1522-9761-4ce6-9553-c0597519c0cc': {'vector_count': 1},\n", + " '80ad2f67-e385-4d37-8acc-07a5ea137ef3': {'vector_count': 15},\n", + " '8dfb77d0-6bd4-4aa9-ac26-bbe52d5972c0-0-0': {'vector_count': 3},\n", + " '8ef0140a-64a5-4c5b-b564-b3a466f74e99': {'vector_count': 1},\n", + " '907ba889-b54f-4442-a376-32287b3c2147-0-0': {'vector_count': 3},\n", + " '907ba889-b54f-4442-a376-32287b3c2147-1-0': {'vector_count': 1},\n", + " '98955d2e-bdd7-4087-8471-63c851100cf7': {'vector_count': 6},\n", + " '999e9bba-2f88-4436-973b-16381a4553bc': {'vector_count': 16},\n", + " '9f835454-e2e2-4909-8331-93458c42fd39': {'vector_count': 1},\n", + " 'ReadTheDocs': {'vector_count': 4540},\n", + " 'ReadTheDocs_test': {'vector_count': 95},\n", + " 'a43d23f3-6114-4aa6-a33e-5eb9391791ec': {'vector_count': 3},\n", + " 
'a5e7d04c-5914-4586-9be9-bd996cafdb21': {'vector_count': 51},\n", + " 'ad3dc281-9d3c-4504-befe-890645b8798e-0-0': {'vector_count': 2},\n", + " 'ad3dc281-9d3c-4504-befe-890645b8798e-1-0': {'vector_count': 3},\n", + " 'b1a5cd8b-d5f4-4286-94f6-a755e29e01e8': {'vector_count': 10},\n", + " 'b25e579c-acdf-4a94-83eb-f6ff0252e382': {'vector_count': 3},\n", + " 'b3624fc9-8e96-4aef-813d-b88f239ff3e1': {'vector_count': 6},\n", + " 'b3a05001-7042-4a60-bb67-23739bf39f7f': {'vector_count': 2},\n", + " 'b5906fdc-d564-4e1f-b261-c48f236eb29e': {'vector_count': 1},\n", + " 'b649e762-3578-4017-b54f-1fa98678eb94': {'vector_count': 4},\n", + " 'ba8901af-af54-4e33-a460-81903f33e3de': {'vector_count': 1},\n", + " 'bbe59a3c-0ab5-47fa-95b9-7c42628712d5': {'vector_count': 5},\n", + " 'bd9dd710-316d-4395-840f-92106686a7b5': {'vector_count': 1},\n", + " 'bdf196f9-de25-4fce-8f3b-98a7e25118ec': {'vector_count': 7},\n", + " 'be8d554e-beb7-4ea9-aac8-3fc4e28d8388': {'vector_count': 3},\n", + " 'c0085ba7-b220-4d48-8f3d-14f62c15a2f3': {'vector_count': 18},\n", + " 'c4394301-a1b6-44b4-9ffb-9e2d404819ea': {'vector_count': 1},\n", + " 'c63776e4-4496-496c-bc7c-c7df1471bcd4': {'vector_count': 30},\n", + " 'c94fabff-c774-43a8-bc34-c9663e668739': {'vector_count': 6},\n", + " 'c9e76757-5fc0-48df-8242-389e87b21dcf': {'vector_count': 7},\n", + " 'cdcd065d-6485-4411-8e8b-4c95ae6c95be': {'vector_count': 1},\n", + " 'cfb63abc-a149-4e3c-b607-1698ee46ea51-0-0': {'vector_count': 4},\n", + " 'd041c90f-b801-4aca-b91b-9e32426f48c4-0-0': {'vector_count': 3},\n", + " 'd041c90f-b801-4aca-b91b-9e32426f48c4-1-0': {'vector_count': 1},\n", + " 'd35fcc74-1d05-484b-87c9-9ee2535ce799': {'vector_count': 6},\n", + " 'd555e569-bc94-4cf1-8dab-433fc14c6668': {'vector_count': 38},\n", + " 'd7119bff-c421-4b2d-8f56-15e75dbf0ab9': {'vector_count': 1},\n", + " 'd95e8f89-5ff5-477d-a01c-d8acaa4aef08': {'vector_count': 13},\n", + " 'e00894b9-2106-459c-a4e3-885d678548ec': {'vector_count': 13},\n", + " 'e0ebdc4b-1ec3-4cba-9416-498455d8a35c': 
{'vector_count': 7},\n", + " 'e1ecc3db-54c1-4951-9536-b3a4a39935f8': {'vector_count': 2},\n", + " 'e1f80bf6-18ac-45ba-bd6b-280ae4b21199': {'vector_count': 20},\n", + " 'e8c7479d-cbe8-4c6d-a863-b48d5e15c81a-0-0': {'vector_count': 2},\n", + " 'e8c7479d-cbe8-4c6d-a863-b48d5e15c81a-1-0': {'vector_count': 2},\n", + " 'ecda672f-4954-4a59-8eed-e32934fd2e23': {'vector_count': 1},\n", + " 'f100853d-d41b-49d0-8827-46cd696b9624': {'vector_count': 5},\n", + " 'f1a31140-92f8-4250-bd7b-11e8d0a4e92d': {'vector_count': 4},\n", + " 'f982e6dd-f84d-40ac-ac00-0a84c03fedab': {'vector_count': 1},\n", + " 'fe1a41eb-63df-402b-9272-3fd0195b1f39': {'vector_count': 13},\n", + " 'session_id-0-0': {'vector_count': 2},\n", + " 'session_id-1-0': {'vector_count': 4},\n", + " 'session_id-1-1': {'vector_count': 3},\n", + " 'session_id-1-2': {'vector_count': 2},\n", + " 'session_id-2-0': {'vector_count': 1},\n", + " 'session_id-3-0': {'vector_count': 1},\n", + " 'session_id-3-1': {'vector_count': 1},\n", + " 'session_id-3-2': {'vector_count': 1},\n", + " 'session_id-4-0': {'vector_count': 1},\n", + " 'session_id-4-1': {'vector_count': 1},\n", + " 'session_id-4-2': {'vector_count': 1},\n", + " 'session_id-5-0': {'vector_count': 2},\n", + " 'session_id-6-0': {'vector_count': 3},\n", + " 'session_id-6-1': {'vector_count': 3},\n", + " 'session_id-7-0': {'vector_count': 3},\n", + " 'session_id-7-1': {'vector_count': 1},\n", + " 'session_id-8-0': {'vector_count': 2},\n", + " 'test-1': {'vector_count': 4}},\n", + " 'total_vector_count': 5501}" + ] + }, + "execution_count": 162, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vectorstore.db.describe_index_stats()" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "metadata": { + "id": "I26Pf-N34LMp" + }, + "outputs": [], + "source": [ + "vectorstore.add_texts(texts, meta)" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": 
"H7BKest_wfnb", + "outputId": "f6f67113-c81c-4008-889c-18b2d63c9b92" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'source': 'python.langchain.com/en/latest/modules/chains/examples/flare.html'}" + ] + }, + "execution_count": 164, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d = vectorstore.similarity_search(\"what is langchain\")\n", + "# result = \"\"\n", + "# for doc in docs:\n", + "# result += doc.page_content + \"\\n\\n\"\n", + "\n", + "# print(result)\n", + "\n", + "d[0].metadata" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "f50e_splyjDA", + "65xCJilpgWqm", + "BTLdeq9i0Rtf" + ], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}