From 0c5a05ac5948cdd3716af5d7e5f476cf0bd58096 Mon Sep 17 00:00:00 2001 From: PrashantDixit-dev Date: Mon, 23 Dec 2024 12:08:59 +0530 Subject: [PATCH] demo gif --- README.md | 2 +- examples/RAG-On-PDF/main.ipynb | 1534 +++++++++-------- examples/congee-RAG/README.md | 86 + examples/congee-RAG/cognee_demo.ipynb | 1045 +++++++++++ .../convert-any-image-dataset-to-lance.py | 75 +- .../main.ipynb | 472 ++--- 6 files changed, 2196 insertions(+), 1018 deletions(-) create mode 100644 examples/congee-RAG/README.md create mode 100644 examples/congee-RAG/cognee_demo.ipynb diff --git a/README.md b/README.md index f8b1e2b..029197c 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,7 @@ These are ready to use applications built using LanceDB serverless vector databa |-----------------------------------------------------|----------------------------------------------------------------------------------------------------------------------|-------------------------------------------| | [Writing assistant](https://github.com/lancedb/vectordb-recipes/tree/main/applications/node/lanchain_writing_assistant) | Writing assistant app using lanchain.js with LanceDB, allows you to get real time relevant suggestions and facts based on you written text to help you with your writing. | ![Writing assistant](https://github.com/user-attachments/assets/87354e93-df4d-40ad-922b-abcbb62d667c) | | [Sentence auto complete](https://github.com/lancedb/vectordb-recipes/tree/main/applications/node/sentance_auto_complete) | Sentance auto complete app using lanchain.js with LanceDB, allows you to get real time relevant auto complete suggestions and facts based on you written text to help you with your writing.You can also upload your data source in the form of a pdf file.You can switch between gpt models to get faster results. 
| ![Sentance auto complete](https://github.com/lancedb/assets/blob/main/recipes/sentance_Auto_complete.gif) | -| [Article Recommendation](https://github.com/lancedb/vectordb-recipes/tree/main/applications/node/article_recommender) | Article Recommender: Explore vast data set of articles with Instant, Context-Aware Suggestions. Leveraging Advanced NLP, Vector Search, and Customizable Datasets, Our App Delivers Real-Time, Precise Article Recommendations. Perfect for Research, Content Curation, and Staying Informed. Unlock Smarter Insights with State-of-the-Art Technology in Content Retrieval and Discovery!". | ![Article Recommendation](https://github.com/lancedb/assets/blob/main/recipes/article_recommendation_engine.gif) | +| [Article Recommendation](https://github.com/lancedb/vectordb-recipes/tree/main/applications/node/article_recommender) | Article Recommender: Explore vast data set of articles with Instant, Context-Aware Suggestions. Leveraging Advanced NLP, Vector Search, and Customizable Datasets, Our App Delivers Real-Time, Precise Article Recommendations. Perfect for Research, Content Curation, and Staying Informed. Unlock Smarter Insights with State-of-the-Art Technology in Content Retrieval and Discovery!". 
| ![Article Recommendation](./applications/node/article_recommender/public/assets/article_recommendation_engine.gif) | |||| | Project Name | Description | Screenshot | diff --git a/examples/RAG-On-PDF/main.ipynb b/examples/RAG-On-PDF/main.ipynb index e4b1898..ac57193 100644 --- a/examples/RAG-On-PDF/main.ipynb +++ b/examples/RAG-On-PDF/main.ipynb @@ -1,780 +1,794 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "gpuType": "T4" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# FarmerGPT - Build Chat with PDF using lancedb" - ], - "metadata": { - "id": "10yLPb9Fneh0" - } - }, - { - "cell_type": "markdown", - "source": [ - "We utilized this PDF: Dataset/FARM5CROPS_farmergpt.pdf,\n", - " which contains crop variety information for sugarcane, turmeric, bamboo, cashew nuts, and more.\n", - " This is a sample project designed to demonstrate how to build an application using LanceDB and LangChain\n", - "The use case and prompts can be customized as needed to suit specific requirements" - ], - "metadata": { - "id": "xFHTAtayosS-" - } - }, - { - "cell_type": "markdown", - "source": [ - "Import pacages" - ], - "metadata": { - "id": "eXsCZEUvb39u" - } - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "EaKlPN9u6bnc", - "outputId": "89f72bd3-ddf9-4ccc-a249-50d50e4012be" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: langchain in /usr/local/lib/python3.10/dist-packages (0.3.12)\n", - "Collecting lancedb\n", - " Downloading lancedb-0.17.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (4.7 kB)\n", - "Requirement 
already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.2)\n", - "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.36)\n", - "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.11.10)\n", - "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n", - "Requirement already satisfied: langchain-core<0.4.0,>=0.3.25 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.3.25)\n", - "Requirement already satisfied: langchain-text-splitters<0.4.0,>=0.3.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.3.3)\n", - "Requirement already satisfied: langsmith<0.3,>=0.1.17 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.2.3)\n", - "Requirement already satisfied: numpy<2,>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.26.4)\n", - "Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.10.3)\n", - "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.32.3)\n", - "Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (9.0.0)\n", - "Collecting deprecation (from lancedb)\n", - " Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)\n", - "Collecting pylance==0.20.0 (from lancedb)\n", - " Downloading pylance-0.20.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (7.4 kB)\n", - "Requirement already satisfied: tqdm>=4.27.0 in /usr/local/lib/python3.10/dist-packages (from lancedb) (4.67.1)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from lancedb) (24.2)\n", - "Collecting overrides>=0.7 (from lancedb)\n", - " Downloading 
overrides-7.7.0-py3-none-any.whl.metadata (5.8 kB)\n", - "Requirement already satisfied: pyarrow>=14 in /usr/local/lib/python3.10/dist-packages (from pylance==0.20.0->lancedb) (17.0.0)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (2.4.4)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.2)\n", - "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (24.3.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.5.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.1.0)\n", - "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (0.2.1)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.18.3)\n", - "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.25->langchain) (1.33)\n", - "Requirement already satisfied: typing-extensions>=4.7 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.25->langchain) (4.12.2)\n", - "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.17->langchain) (0.28.1)\n", - "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.17->langchain) (3.10.12)\n", - "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.17->langchain) 
(1.0.0)\n", - "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain) (0.7.0)\n", - "Requirement already satisfied: pydantic-core==2.27.1 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain) (2.27.1)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.4.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.10)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2.2.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2024.12.14)\n", - "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.1.1)\n", - "Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (3.7.1)\n", - "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (1.0.7)\n", - "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (0.14.0)\n", - "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.25->langchain) (3.0.0)\n", - "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (1.3.1)\n", - "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from 
anyio->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (1.2.2)\n", - "Downloading lancedb-0.17.0-cp39-abi3-manylinux_2_28_x86_64.whl (29.9 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m29.9/29.9 MB\u001b[0m \u001b[31m51.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading pylance-0.20.0-cp39-abi3-manylinux_2_28_x86_64.whl (33.5 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m33.5/33.5 MB\u001b[0m \u001b[31m16.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading overrides-7.7.0-py3-none-any.whl (17 kB)\n", - "Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)\n", - "Installing collected packages: overrides, deprecation, pylance, lancedb\n", - "Successfully installed deprecation-2.1.0 lancedb-0.17.0 overrides-7.7.0 pylance-0.20.0\n", - "Collecting tantivy\n", - " Downloading tantivy-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.2 kB)\n", - "Downloading tantivy-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m40.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: tantivy\n", - "Successfully installed tantivy-0.22.0\n", - "Collecting langchain-openai\n", - " Downloading langchain_openai-0.2.14-py3-none-any.whl.metadata (2.7 kB)\n", - "Collecting langchain-community\n", - " Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)\n", - "Collecting pypdf\n", - " Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)\n", - "Collecting langchain-core<0.4.0,>=0.3.27 (from langchain-openai)\n", - " Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)\n", - "Collecting openai<2.0.0,>=1.58.1 (from langchain-openai)\n", - " Downloading openai-1.58.1-py3-none-any.whl.metadata (27 
kB)\n", - "Collecting tiktoken<1,>=0.7 (from langchain-openai)\n", - " Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n", - "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (6.0.2)\n", - "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.0.36)\n", - "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (3.11.10)\n", - "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)\n", - " Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)\n", - "Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)\n", - " Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)\n", - "Collecting langchain<0.4.0,>=0.3.13 (from langchain-community)\n", - " Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)\n", - "Requirement already satisfied: langsmith<0.3,>=0.1.125 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.2.3)\n", - "Requirement already satisfied: numpy<2,>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (1.26.4)\n", - "Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)\n", - " Downloading pydantic_settings-2.7.0-py3-none-any.whl.metadata (3.5 kB)\n", - "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.32.3)\n", - "Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (9.0.0)\n", - "Requirement already satisfied: typing_extensions>=4.0 in /usr/local/lib/python3.10/dist-packages (from pypdf) (4.12.2)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) 
(2.4.4)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.3.2)\n", - "Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (4.0.3)\n", - "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (24.3.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.5.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (6.1.0)\n", - "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (0.2.1)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.18.3)\n", - "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)\n", - " Downloading marshmallow-3.23.2-py3-none-any.whl.metadata (7.1 kB)\n", - "Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)\n", - " Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)\n", - "Requirement already satisfied: langchain-text-splitters<0.4.0,>=0.3.3 in /usr/local/lib/python3.10/dist-packages (from langchain<0.4.0,>=0.3.13->langchain-community) (0.3.3)\n", - "Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /usr/local/lib/python3.10/dist-packages (from langchain<0.4.0,>=0.3.13->langchain-community) (2.10.3)\n", - "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.27->langchain-openai) (1.33)\n", - "Requirement 
already satisfied: packaging<25,>=23.2 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.27->langchain-openai) (24.2)\n", - "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.125->langchain-community) (0.28.1)\n", - "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.125->langchain-community) (3.10.12)\n", - "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.125->langchain-community) (1.0.0)\n", - "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.58.1->langchain-openai) (3.7.1)\n", - "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.58.1->langchain-openai) (1.9.0)\n", - "Requirement already satisfied: jiter<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.58.1->langchain-openai) (0.8.2)\n", - "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.58.1->langchain-openai) (1.3.1)\n", - "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.58.1->langchain-openai) (4.67.1)\n", - "Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)\n", - " Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.4.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.10)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from 
requests<3,>=2->langchain-community) (2.2.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (2024.12.14)\n", - "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain-community) (3.1.1)\n", - "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken<1,>=0.7->langchain-openai) (2024.11.6)\n", - "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai<2.0.0,>=1.58.1->langchain-openai) (1.2.2)\n", - "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (1.0.7)\n", - "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (0.14.0)\n", - "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.27->langchain-openai) (3.0.0)\n", - "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.13->langchain-community) (0.7.0)\n", - "Requirement already satisfied: pydantic-core==2.27.1 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.13->langchain-community) (2.27.1)\n", - "Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)\n", - " Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)\n", - "Downloading langchain_openai-0.2.14-py3-none-any.whl (50 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.9/50.9 kB\u001b[0m \u001b[31m4.9 
MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading langchain_community-0.3.13-py3-none-any.whl (2.5 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.5/2.5 MB\u001b[0m \u001b[31m50.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading pypdf-5.1.0-py3-none-any.whl (297 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.0/298.0 kB\u001b[0m \u001b[31m27.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)\n", - "Downloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)\n", - "Downloading langchain-0.3.13-py3-none-any.whl (1.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m57.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading langchain_core-0.3.28-py3-none-any.whl (411 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m411.6/411.6 kB\u001b[0m \u001b[31m33.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading openai-1.58.1-py3-none-any.whl (454 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.3/454.3 kB\u001b[0m \u001b[31m26.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading pydantic_settings-2.7.0-py3-none-any.whl (29 kB)\n", - "Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m68.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading marshmallow-3.23.2-py3-none-any.whl (49 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - 
"\u001b[?25hDownloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n", - "Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n", - "Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", - "Installing collected packages: python-dotenv, pypdf, mypy-extensions, marshmallow, httpx-sse, typing-inspect, tiktoken, pydantic-settings, openai, dataclasses-json, langchain-core, langchain-openai, langchain, langchain-community\n", - " Attempting uninstall: openai\n", - " Found existing installation: openai 1.57.4\n", - " Uninstalling openai-1.57.4:\n", - " Successfully uninstalled openai-1.57.4\n", - " Attempting uninstall: langchain-core\n", - " Found existing installation: langchain-core 0.3.25\n", - " Uninstalling langchain-core-0.3.25:\n", - " Successfully uninstalled langchain-core-0.3.25\n", - " Attempting uninstall: langchain\n", - " Found existing installation: langchain 0.3.12\n", - " Uninstalling langchain-0.3.12:\n", - " Successfully uninstalled langchain-0.3.12\n", - "Successfully installed dataclasses-json-0.6.7 httpx-sse-0.4.0 langchain-0.3.13 langchain-community-0.3.13 langchain-core-0.3.28 langchain-openai-0.2.14 marshmallow-3.23.2 mypy-extensions-1.0.0 openai-1.58.1 pydantic-settings-2.7.0 pypdf-5.1.0 python-dotenv-1.0.1 tiktoken-0.8.0 typing-inspect-0.9.0\n", - "Collecting gradio\n", - " Downloading gradio-5.9.1-py3-none-any.whl.metadata (16 kB)\n", - "Collecting aiofiles<24.0,>=22.0 (from gradio)\n", - " Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)\n", - "Requirement already satisfied: anyio<5.0,>=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.7.1)\n", - "Collecting fastapi<1.0,>=0.115.2 (from gradio)\n", - " Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)\n", - "Collecting ffmpy (from gradio)\n", - " Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)\n", - "Collecting gradio-client==1.5.2 (from gradio)\n", - " Downloading gradio_client-1.5.2-py3-none-any.whl.metadata (7.1 kB)\n", - 
"Requirement already satisfied: httpx>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from gradio) (0.28.1)\n", - "Requirement already satisfied: huggingface-hub>=0.25.1 in /usr/local/lib/python3.10/dist-packages (from gradio) (0.27.0)\n", - "Requirement already satisfied: jinja2<4.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.1.4)\n", - "Collecting markupsafe~=2.0 (from gradio)\n", - " Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)\n", - "Requirement already satisfied: numpy<3.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (1.26.4)\n", - "Requirement already satisfied: orjson~=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.10.12)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from gradio) (24.2)\n", - "Requirement already satisfied: pandas<3.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (2.2.2)\n", - "Requirement already satisfied: pillow<12.0,>=8.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (11.0.0)\n", - "Requirement already satisfied: pydantic>=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (2.10.3)\n", - "Collecting pydub (from gradio)\n", - " Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)\n", - "Collecting python-multipart>=0.0.18 (from gradio)\n", - " Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)\n", - "Requirement already satisfied: pyyaml<7.0,>=5.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (6.0.2)\n", - "Collecting ruff>=0.2.2 (from gradio)\n", - " Downloading ruff-0.8.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n", - "Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)\n", - " Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)\n", - "Collecting semantic-version~=2.0 (from gradio)\n", - " Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 
kB)\n", - "Collecting starlette<1.0,>=0.40.0 (from gradio)\n", - " Downloading starlette-0.42.0-py3-none-any.whl.metadata (6.0 kB)\n", - "Collecting tomlkit<0.14.0,>=0.12.0 (from gradio)\n", - " Downloading tomlkit-0.13.2-py3-none-any.whl.metadata (2.7 kB)\n", - "Requirement already satisfied: typer<1.0,>=0.12 in /usr/local/lib/python3.10/dist-packages (from gradio) (0.15.1)\n", - "Requirement already satisfied: typing-extensions~=4.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (4.12.2)\n", - "Collecting uvicorn>=0.14.0 (from gradio)\n", - " Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)\n", - "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from gradio-client==1.5.2->gradio) (2024.10.0)\n", - "Requirement already satisfied: websockets<15.0,>=10.0 in /usr/local/lib/python3.10/dist-packages (from gradio-client==1.5.2->gradio) (14.1)\n", - "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio) (3.10)\n", - "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio) (1.3.1)\n", - "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio) (1.2.2)\n", - "Collecting starlette<1.0,>=0.40.0 (from gradio)\n", - " Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio) (2024.12.14)\n", - "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio) (1.0.7)\n", - "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx>=0.24.1->gradio) (0.14.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.25.1->gradio) (3.16.1)\n", - "Requirement 
already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.25.1->gradio) (2.32.3)\n", - "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.25.1->gradio) (4.67.1)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio) (2024.2)\n", - "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio) (2024.2)\n", - "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio) (0.7.0)\n", - "Requirement already satisfied: pydantic-core==2.27.1 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio) (2.27.1)\n", - "Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio) (8.1.7)\n", - "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio) (1.5.4)\n", - "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio) (13.9.4)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas<3.0,>=1.0->gradio) (1.17.0)\n", - "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (3.0.0)\n", - "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (2.18.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.25.1->gradio) 
(3.4.0)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.25.1->gradio) (2.2.3)\n", - "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0,>=0.12->gradio) (0.1.2)\n", - "Downloading gradio-5.9.1-py3-none-any.whl (57.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.2/57.2 MB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading gradio_client-1.5.2-py3-none-any.whl (320 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m320.4/320.4 kB\u001b[0m \u001b[31m24.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading aiofiles-23.2.1-py3-none-any.whl (15 kB)\n", - "Downloading fastapi-0.115.6-py3-none-any.whl (94 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25 kB)\n", - "Downloading python_multipart-0.0.20-py3-none-any.whl (24 kB)\n", - "Downloading ruff-0.8.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.2/11.2 MB\u001b[0m \u001b[31m117.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading safehttpx-0.1.6-py3-none-any.whl (8.7 kB)\n", - "Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n", - "Downloading starlette-0.41.3-py3-none-any.whl (73 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.2/73.2 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading tomlkit-0.13.2-py3-none-any.whl (37 
kB)\n", - "Downloading uvicorn-0.34.0-py3-none-any.whl (62 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.3/62.3 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading ffmpy-0.5.0-py3-none-any.whl (6.0 kB)\n", - "Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n", - "Installing collected packages: pydub, uvicorn, tomlkit, semantic-version, ruff, python-multipart, markupsafe, ffmpy, aiofiles, starlette, safehttpx, gradio-client, fastapi, gradio\n", - " Attempting uninstall: markupsafe\n", - " Found existing installation: MarkupSafe 3.0.2\n", - " Uninstalling MarkupSafe-3.0.2:\n", - " Successfully uninstalled MarkupSafe-3.0.2\n", - "Successfully installed aiofiles-23.2.1 fastapi-0.115.6 ffmpy-0.5.0 gradio-5.9.1 gradio-client-1.5.2 markupsafe-2.1.5 pydub-0.25.1 python-multipart-0.0.20 ruff-0.8.4 safehttpx-0.1.6 semantic-version-2.10.0 starlette-0.41.3 tomlkit-0.13.2 uvicorn-0.34.0\n" - ] - } - ], - "source": [ - "! pip install langchain lancedb\n", - "! pip install tantivy\n", - "! pip install -U langchain-openai langchain-community pypdf\n", - "! 
pip install gradio" - ] + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# FarmerGPT - Build Chat with PDF using lancedb" + ], + "metadata": { + "id": "10yLPb9Fneh0" + } + }, + { + "cell_type": "markdown", + "source": [ + "We utilized this PDF: Dataset/FARM5CROPS_farmergpt.pdf,\n", + " which contains crop variety information for sugarcane, turmeric, bamboo, cashew nuts, and more.\n", + " This is a sample project designed to demonstrate how to build an application using LanceDB and LangChain.\n", + "The use case and prompts can be customized as needed to suit specific requirements." + ], + "metadata": { + "id": "xFHTAtayosS-" + } + }, + { + "cell_type": "markdown", + "source": [ + "Import packages" + ], + "metadata": { + "id": "eXsCZEUvb39u" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "EaKlPN9u6bnc", + "outputId": "89f72bd3-ddf9-4ccc-a249-50d50e4012be" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "# pass opeani key or use any LLM\n", - "import os\n", - "os.environ[\"OPENAI_API_KEY\"] = \"sk-proj-\"" - ], - "metadata": { - "id": "ylj23ZpC-SG1" - }, - "execution_count": 7, - "outputs": [] + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: langchain in /usr/local/lib/python3.10/dist-packages (0.3.12)\n", + "Collecting lancedb\n", + " Downloading lancedb-0.17.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (4.7 kB)\n", + "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.2)\n", + "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.36)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from 
langchain) (3.11.10)\n", + "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n", + "Requirement already satisfied: langchain-core<0.4.0,>=0.3.25 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.3.25)\n", + "Requirement already satisfied: langchain-text-splitters<0.4.0,>=0.3.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.3.3)\n", + "Requirement already satisfied: langsmith<0.3,>=0.1.17 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.2.3)\n", + "Requirement already satisfied: numpy<2,>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.26.4)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.10.3)\n", + "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.32.3)\n", + "Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (9.0.0)\n", + "Collecting deprecation (from lancedb)\n", + " Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)\n", + "Collecting pylance==0.20.0 (from lancedb)\n", + " Downloading pylance-0.20.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (7.4 kB)\n", + "Requirement already satisfied: tqdm>=4.27.0 in /usr/local/lib/python3.10/dist-packages (from lancedb) (4.67.1)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from lancedb) (24.2)\n", + "Collecting overrides>=0.7 (from lancedb)\n", + " Downloading overrides-7.7.0-py3-none-any.whl.metadata (5.8 kB)\n", + "Requirement already satisfied: pyarrow>=14 in /usr/local/lib/python3.10/dist-packages (from pylance==0.20.0->lancedb) (17.0.0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (2.4.4)\n", + "Requirement already 
satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.2)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (24.3.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.5.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.1.0)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (0.2.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.18.3)\n", + "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.25->langchain) (1.33)\n", + "Requirement already satisfied: typing-extensions>=4.7 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.25->langchain) (4.12.2)\n", + "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.17->langchain) (0.28.1)\n", + "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.17->langchain) (3.10.12)\n", + "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.17->langchain) (1.0.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.27.1 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain) (2.27.1)\n", + "Requirement already satisfied: 
charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.4.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2024.12.14)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.1.1)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (3.7.1)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (1.0.7)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (0.14.0)\n", + "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.25->langchain) (3.0.0)\n", + "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (1.3.1)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.17->langchain) (1.2.2)\n", + "Downloading lancedb-0.17.0-cp39-abi3-manylinux_2_28_x86_64.whl (29.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m29.9/29.9 MB\u001b[0m \u001b[31m51.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading 
pylance-0.20.0-cp39-abi3-manylinux_2_28_x86_64.whl (33.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m33.5/33.5 MB\u001b[0m \u001b[31m16.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading overrides-7.7.0-py3-none-any.whl (17 kB)\n", + "Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)\n", + "Installing collected packages: overrides, deprecation, pylance, lancedb\n", + "Successfully installed deprecation-2.1.0 lancedb-0.17.0 overrides-7.7.0 pylance-0.20.0\n", + "Collecting tantivy\n", + " Downloading tantivy-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.2 kB)\n", + "Downloading tantivy-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m40.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: tantivy\n", + "Successfully installed tantivy-0.22.0\n", + "Collecting langchain-openai\n", + " Downloading langchain_openai-0.2.14-py3-none-any.whl.metadata (2.7 kB)\n", + "Collecting langchain-community\n", + " Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)\n", + "Collecting pypdf\n", + " Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)\n", + "Collecting langchain-core<0.4.0,>=0.3.27 (from langchain-openai)\n", + " Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)\n", + "Collecting openai<2.0.0,>=1.58.1 (from langchain-openai)\n", + " Downloading openai-1.58.1-py3-none-any.whl.metadata (27 kB)\n", + "Collecting tiktoken<1,>=0.7 (from langchain-openai)\n", + " Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n", + "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (6.0.2)\n", + "Requirement already satisfied: 
SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.0.36)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (3.11.10)\n", + "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)\n", + " Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)\n", + "Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)\n", + " Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)\n", + "Collecting langchain<0.4.0,>=0.3.13 (from langchain-community)\n", + " Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)\n", + "Requirement already satisfied: langsmith<0.3,>=0.1.125 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.2.3)\n", + "Requirement already satisfied: numpy<2,>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (1.26.4)\n", + "Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)\n", + " Downloading pydantic_settings-2.7.0-py3-none-any.whl.metadata (3.5 kB)\n", + "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.32.3)\n", + "Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (9.0.0)\n", + "Requirement already satisfied: typing_extensions>=4.0 in /usr/local/lib/python3.10/dist-packages (from pypdf) (4.12.2)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (2.4.4)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.3.2)\n", + "Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (4.0.3)\n", + 
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (24.3.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.5.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (6.1.0)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (0.2.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.18.3)\n", + "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)\n", + " Downloading marshmallow-3.23.2-py3-none-any.whl.metadata (7.1 kB)\n", + "Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)\n", + " Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)\n", + "Requirement already satisfied: langchain-text-splitters<0.4.0,>=0.3.3 in /usr/local/lib/python3.10/dist-packages (from langchain<0.4.0,>=0.3.13->langchain-community) (0.3.3)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /usr/local/lib/python3.10/dist-packages (from langchain<0.4.0,>=0.3.13->langchain-community) (2.10.3)\n", + "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.27->langchain-openai) (1.33)\n", + "Requirement already satisfied: packaging<25,>=23.2 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.27->langchain-openai) (24.2)\n", + "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.125->langchain-community) (0.28.1)\n", + "Requirement already 
satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.125->langchain-community) (3.10.12)\n", + "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.125->langchain-community) (1.0.0)\n", + "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.58.1->langchain-openai) (3.7.1)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.58.1->langchain-openai) (1.9.0)\n", + "Requirement already satisfied: jiter<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.58.1->langchain-openai) (0.8.2)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.58.1->langchain-openai) (1.3.1)\n", + "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.58.1->langchain-openai) (4.67.1)\n", + "Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)\n", + " Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.4.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (2024.12.14)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain-community) (3.1.1)\n", + 
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken<1,>=0.7->langchain-openai) (2024.11.6)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai<2.0.0,>=1.58.1->langchain-openai) (1.2.2)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (1.0.7)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (0.14.0)\n", + "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.27->langchain-openai) (3.0.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.13->langchain-community) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.27.1 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.13->langchain-community) (2.27.1)\n", + "Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)\n", + " Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)\n", + "Downloading langchain_openai-0.2.14-py3-none-any.whl (50 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.9/50.9 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading langchain_community-0.3.13-py3-none-any.whl (2.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.5/2.5 MB\u001b[0m \u001b[31m50.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading pypdf-5.1.0-py3-none-any.whl (297 kB)\n", + 
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.0/298.0 kB\u001b[0m \u001b[31m27.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)\n", + "Downloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)\n", + "Downloading langchain-0.3.13-py3-none-any.whl (1.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m57.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading langchain_core-0.3.28-py3-none-any.whl (411 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m411.6/411.6 kB\u001b[0m \u001b[31m33.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading openai-1.58.1-py3-none-any.whl (454 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.3/454.3 kB\u001b[0m \u001b[31m26.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading pydantic_settings-2.7.0-py3-none-any.whl (29 kB)\n", + "Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m68.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading marshmallow-3.23.2-py3-none-any.whl (49 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n", + "Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n", + "Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", + "Installing collected packages: python-dotenv, pypdf, mypy-extensions, marshmallow, httpx-sse, typing-inspect, tiktoken, pydantic-settings, openai, dataclasses-json, 
langchain-core, langchain-openai, langchain, langchain-community\n", + " Attempting uninstall: openai\n", + " Found existing installation: openai 1.57.4\n", + " Uninstalling openai-1.57.4:\n", + " Successfully uninstalled openai-1.57.4\n", + " Attempting uninstall: langchain-core\n", + " Found existing installation: langchain-core 0.3.25\n", + " Uninstalling langchain-core-0.3.25:\n", + " Successfully uninstalled langchain-core-0.3.25\n", + " Attempting uninstall: langchain\n", + " Found existing installation: langchain 0.3.12\n", + " Uninstalling langchain-0.3.12:\n", + " Successfully uninstalled langchain-0.3.12\n", + "Successfully installed dataclasses-json-0.6.7 httpx-sse-0.4.0 langchain-0.3.13 langchain-community-0.3.13 langchain-core-0.3.28 langchain-openai-0.2.14 marshmallow-3.23.2 mypy-extensions-1.0.0 openai-1.58.1 pydantic-settings-2.7.0 pypdf-5.1.0 python-dotenv-1.0.1 tiktoken-0.8.0 typing-inspect-0.9.0\n", + "Collecting gradio\n", + " Downloading gradio-5.9.1-py3-none-any.whl.metadata (16 kB)\n", + "Collecting aiofiles<24.0,>=22.0 (from gradio)\n", + " Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)\n", + "Requirement already satisfied: anyio<5.0,>=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.7.1)\n", + "Collecting fastapi<1.0,>=0.115.2 (from gradio)\n", + " Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)\n", + "Collecting ffmpy (from gradio)\n", + " Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)\n", + "Collecting gradio-client==1.5.2 (from gradio)\n", + " Downloading gradio_client-1.5.2-py3-none-any.whl.metadata (7.1 kB)\n", + "Requirement already satisfied: httpx>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from gradio) (0.28.1)\n", + "Requirement already satisfied: huggingface-hub>=0.25.1 in /usr/local/lib/python3.10/dist-packages (from gradio) (0.27.0)\n", + "Requirement already satisfied: jinja2<4.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.1.4)\n", + "Collecting 
markupsafe~=2.0 (from gradio)\n", + " Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)\n", + "Requirement already satisfied: numpy<3.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (1.26.4)\n", + "Requirement already satisfied: orjson~=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.10.12)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from gradio) (24.2)\n", + "Requirement already satisfied: pandas<3.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (2.2.2)\n", + "Requirement already satisfied: pillow<12.0,>=8.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (11.0.0)\n", + "Requirement already satisfied: pydantic>=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (2.10.3)\n", + "Collecting pydub (from gradio)\n", + " Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)\n", + "Collecting python-multipart>=0.0.18 (from gradio)\n", + " Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)\n", + "Requirement already satisfied: pyyaml<7.0,>=5.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (6.0.2)\n", + "Collecting ruff>=0.2.2 (from gradio)\n", + " Downloading ruff-0.8.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n", + "Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)\n", + " Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)\n", + "Collecting semantic-version~=2.0 (from gradio)\n", + " Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)\n", + "Collecting starlette<1.0,>=0.40.0 (from gradio)\n", + " Downloading starlette-0.42.0-py3-none-any.whl.metadata (6.0 kB)\n", + "Collecting tomlkit<0.14.0,>=0.12.0 (from gradio)\n", + " Downloading tomlkit-0.13.2-py3-none-any.whl.metadata (2.7 kB)\n", + "Requirement already satisfied: typer<1.0,>=0.12 in /usr/local/lib/python3.10/dist-packages (from gradio) 
(0.15.1)\n", + "Requirement already satisfied: typing-extensions~=4.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (4.12.2)\n", + "Collecting uvicorn>=0.14.0 (from gradio)\n", + " Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from gradio-client==1.5.2->gradio) (2024.10.0)\n", + "Requirement already satisfied: websockets<15.0,>=10.0 in /usr/local/lib/python3.10/dist-packages (from gradio-client==1.5.2->gradio) (14.1)\n", + "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio) (3.10)\n", + "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio) (1.3.1)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio) (1.2.2)\n", + "Collecting starlette<1.0,>=0.40.0 (from gradio)\n", + " Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio) (2024.12.14)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio) (1.0.7)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx>=0.24.1->gradio) (0.14.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.25.1->gradio) (3.16.1)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.25.1->gradio) (2.32.3)\n", + "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.25.1->gradio) (4.67.1)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages 
(from pandas<3.0,>=1.0->gradio) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio) (2024.2)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.27.1 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio) (2.27.1)\n", + "Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio) (8.1.7)\n", + "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio) (1.5.4)\n", + "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio) (13.9.4)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas<3.0,>=1.0->gradio) (1.17.0)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (2.18.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.25.1->gradio) (3.4.0)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.25.1->gradio) (2.2.3)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0,>=0.12->gradio) (0.1.2)\n", + "Downloading 
gradio-5.9.1-py3-none-any.whl (57.2 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.2/57.2 MB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading gradio_client-1.5.2-py3-none-any.whl (320 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m320.4/320.4 kB\u001b[0m \u001b[31m24.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading aiofiles-23.2.1-py3-none-any.whl (15 kB)\n", + "Downloading fastapi-0.115.6-py3-none-any.whl (94 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25 kB)\n", + "Downloading python_multipart-0.0.20-py3-none-any.whl (24 kB)\n", + "Downloading ruff-0.8.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.2 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.2/11.2 MB\u001b[0m \u001b[31m117.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading safehttpx-0.1.6-py3-none-any.whl (8.7 kB)\n", + "Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n", + "Downloading starlette-0.41.3-py3-none-any.whl (73 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.2/73.2 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading tomlkit-0.13.2-py3-none-any.whl (37 kB)\n", + "Downloading uvicorn-0.34.0-py3-none-any.whl (62 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.3/62.3 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading ffmpy-0.5.0-py3-none-any.whl (6.0 kB)\n", + "Downloading pydub-0.25.1-py2.py3-none-any.whl (32 
kB)\n", + "Installing collected packages: pydub, uvicorn, tomlkit, semantic-version, ruff, python-multipart, markupsafe, ffmpy, aiofiles, starlette, safehttpx, gradio-client, fastapi, gradio\n", + " Attempting uninstall: markupsafe\n", + " Found existing installation: MarkupSafe 3.0.2\n", + " Uninstalling MarkupSafe-3.0.2:\n", + " Successfully uninstalled MarkupSafe-3.0.2\n", + "Successfully installed aiofiles-23.2.1 fastapi-0.115.6 ffmpy-0.5.0 gradio-5.9.1 gradio-client-1.5.2 markupsafe-2.1.5 pydub-0.25.1 python-multipart-0.0.20 ruff-0.8.4 safehttpx-0.1.6 semantic-version-2.10.0 starlette-0.41.3 tomlkit-0.13.2 uvicorn-0.34.0\n" + ] + } + ], + "source": [ + "! pip install langchain lancedb\n", + "! pip install tantivy\n", + "! pip install -U langchain-openai langchain-community pypdf\n", + "! pip install gradio" + ] + }, + { + "cell_type": "code", + "source": [ + "# pass OpenAI key or use any LLM\n", + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"sk-proj-\"" + ], + "metadata": { + "id": "ylj23ZpC-SG1" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Download the sample pdf\n", + "!wget https://github.com/vectordb-recipes/examples/raw/main/RAG-On-PDF/Dataset/FARM5CROPS_farmergpt.pdf -O FARM5CROPS_farmergpt.pdf" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - + "id": "xtRI4pTPvahR", + "outputId": "02f3b2dc-4595-403e-e3d2-068fa4b73c4a" + }, + "execution_count": 4, + "outputs": [ { - "cell_type": "code", - "source": [ - "# Download the sample pdf\n", - "!wget https://github.com/vectordb-recipes/examples/raw/main/RAG-On-PDF/Dataset/FARM5CROPS_farmergpt.pdf -O FARM5CROPS_farmergpt.pdf" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "xtRI4pTPvahR", - "outputId": "02f3b2dc-4595-403e-e3d2-068fa4b73c4a" - }, - "execution_count": 4, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2024-12-20 11:04:42-- 
https://github.com/vectordb-recipes/examples/raw/main/RAG-On-PDF/Dataset/FARM5CROPS_farmergpt.pdf\n", - "Resolving github.com (github.com)... 140.82.112.3\n", - "Connecting to github.com (github.com)|140.82.112.3|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://raw.githubusercontent.com/vectordb-recipes/examples/main/RAG-On-PDF/Dataset/FARM5CROPS_farmergp.pdf [following]\n", - "--2024-12-20 11:04:43-- https://raw.githubusercontent.com/vectordb-recipes/examples/main/RAG-On-PDF/Dataset/FARM5CROPS_farmergpt.pdf\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1398674 (1.3M) [application/octet-stream]\n", - "Saving to: ‘FARM5CROPS_farmergpt.pdf’\n", - "\n", - "FARM5CROPS_farmergp 100%[===================>] 1.33M --.-KB/s in 0.04s \n", - "\n", - "2024-12-20 11:04:43 (34.5 MB/s) - ‘FARM5CROPS_farmergpt.pdf’ saved [1398674/1398674]\n", - "\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "--2024-12-20 11:04:42-- https://github.com/vectordb-recipes/examples/raw/main/RAG-On-PDF/Dataset/FARM5CROPS_farmergpt.pdf\n", + "Resolving github.com (github.com)... 140.82.112.3\n", + "Connecting to github.com (github.com)|140.82.112.3|:443... connected.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://raw.githubusercontent.com/vectordb-recipes/examples/main/RAG-On-PDF/Dataset/FARM5CROPS_farmergp.pdf [following]\n", + "--2024-12-20 11:04:43-- https://raw.githubusercontent.com/vectordb-recipes/examples/main/RAG-On-PDF/Dataset/FARM5CROPS_farmergpt.pdf\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1398674 (1.3M) [application/octet-stream]\n", + "Saving to: ‘FARM5CROPS_farmergpt.pdf’\n", + "\n", + "FARM5CROPS_farmergp 100%[===================>] 1.33M --.-KB/s in 0.04s \n", + "\n", + "2024-12-20 11:04:43 (34.5 MB/s) - ‘FARM5CROPS_farmergpt.pdf’ saved [1398674/1398674]\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from lancedb.rerankers import LinearCombinationReranker\n", + "from langchain_community.document_loaders import PyPDFLoader\n", + "from langchain_community.vectorstores import LanceDB\n", + "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "from langchain_text_splitters import CharacterTextSplitter\n", + "from langchain_core.prompts import PromptTemplate\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain.memory import ConversationBufferMemory\n", + "\n", + "\n", + "class QueryProcessor:\n", + " def __init__(self, file_path, db_url=\"lancedb_temp\", table_name=\"lancedb_indic\"):\n", + " \"\"\"\n", + " Initialize the QueryProcessor with the PDF file and set up the vector store.\n", + "\n", + " Parameters:\n", + " file_path (str): Path to the PDF file.\n", + " db_url (str): URI for the LanceDB vector store.\n", + " table_name (str): Name of the table in LanceDB.\n", + " \"\"\"\n", + " # Load and process the PDF document\n", + " loader = PyPDFLoader(file_path)\n", + " documents = loader.load()\n", + " text_splitter = CharacterTextSplitter()\n", + " self.documents = text_splitter.split_documents(documents)\n", + "\n", + " # Initialize embeddings and vector store\n", + " embeddings = OpenAIEmbeddings()\n", + " self.vector_store = LanceDB(\n", + " uri=db_url, embedding=embeddings, table_name=table_name\n", + " )\n", + "\n", + " # Add reranker\n", + " 
self.reranker = LinearCombinationReranker(weight=0.3)\n", + " self.docsearch = LanceDB.from_documents(\n", + " self.documents, embeddings, reranker=self.reranker\n", + " )\n", + "\n", + " print(\"Embedding stored in lancedb\")\n", + " # Initialize LLM and memory\n", + " self.llm = ChatOpenAI(\n", + " model_name=\"gpt-4o\",\n", + " temperature=0.01,\n", + " )\n", + " self.memory = ConversationBufferMemory(memory_key=\"chat_history\")\n", + "\n", + " def generate_prompt_template(\n", + " self, main_instructions, prompt_instructions, context_name, query\n", + " ):\n", + " \"\"\"\n", + " Generate a prompt template for LangChain LLM.\n", + "\n", + " Parameters:\n", + " main_instructions (str): Main instructions for the LLM.\n", + " prompt_instructions (str): Additional instructions for how to use the data.\n", + " context_name (str): The name of the context (e.g., search results).\n", + " query (str): The query from the user.\n", + "\n", + " Returns:\n", + " PromptTemplate: The generated prompt template.\n", + " \"\"\"\n", + " template = f\"\"\"{main_instructions}\n", + "\n", + " {prompt_instructions}\n", + "\n", + " {context_name}:\n", + " {{context}}\n", + "\n", + " Previous Conversations:\n", + " {{chat_history}}\n", + " Human: {query}\n", + " Chatbot:\"\"\"\n", + " return PromptTemplate(\n", + " template=template, input_variables=[\"context\", \"chat_history\"]\n", + " )\n", + "\n", + " def get_answer(self, query):\n", + " \"\"\"\n", + " Process a query and return the answer based on the preloaded PDF.\n", + "\n", + " Parameters:\n", + " query (str): The user's query.\n", + "\n", + " Returns:\n", + " str: The answer to the query.\n", + " \"\"\"\n", + " # Perform similarity search\n", + " docs = self.docsearch.similarity_search_with_relevance_scores(query)\n", + "\n", + " # Generate a prompt\n", + " prompt = self.generate_prompt_template(\n", + " main_instructions=\"Act as a knowledgeable assistant. 
Answer the query comprehensively and concisely based on the provided content.\",\n", + " prompt_instructions=(\n", + " \"Focus on extracting the most relevant and accurate information from the context. \"\n", + " \"Prioritize clarity, conciseness, and detail in your response. \"\n", + " \"When summarizing, ensure key points are highlighted without losing important nuances. \"\n", + " \"If the context is insufficient to fully address the query, acknowledge the limitation clearly.\"\n", + " ),\n", + " context_name=\"PDF Content\",\n", + " query=query,\n", + " )\n", + "\n", + " # Create the LangChain pipeline\n", + " chain = prompt | self.llm | StrOutputParser()\n", + "\n", + " # Invoke the chain and get the answer\n", + " answer = chain.invoke({\"context\": docs, \"chat_history\": self.memory})\n", + " return answer\n", + "\n", + "\n", + "# Initialize the QueryProcessor with the PDF file (done once)\n", + "file_path = \"/content/FARM5CROPS_farmergpt.pdf\"\n", + "query_processor = QueryProcessor(file_path)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "Pzs-a0Kcdokq", + "outputId": "a68fa8d5-edb0-4e7d-964c-4d151ad10a28" + }, + "execution_count": 14, + "outputs": [ { - "cell_type": "code", - "source": [ - "from lancedb.rerankers import LinearCombinationReranker\n", - "from langchain_community.document_loaders import PyPDFLoader\n", - "from langchain_community.vectorstores import LanceDB\n", - "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", - "from langchain_text_splitters import CharacterTextSplitter\n", - "from langchain_core.prompts import PromptTemplate\n", - "from langchain_core.output_parsers import StrOutputParser\n", - "from langchain.memory import ConversationBufferMemory\n", - "\n", - "class QueryProcessor:\n", - " def __init__(self, file_path, db_url=\"lancedb_temp\", table_name=\"lancedb_indic\"):\n", - " \"\"\"\n", - " Initialize the QueryProcessor with the PDF file and set up the vector store.\n", - 
"\n", - " Parameters:\n", - " file_path (str): Path to the PDF file.\n", - " db_url (str): URI for the LanceDB vector store.\n", - " table_name (str): Name of the table in LanceDB.\n", - " \"\"\"\n", - " # Load and process the PDF document\n", - " loader = PyPDFLoader(file_path)\n", - " documents = loader.load()\n", - " text_splitter = CharacterTextSplitter()\n", - " self.documents = text_splitter.split_documents(documents)\n", - "\n", - " # Initialize embeddings and vector store\n", - " embeddings = OpenAIEmbeddings()\n", - " self.vector_store = LanceDB(\n", - " uri=db_url,\n", - " embedding=embeddings,\n", - " table_name=table_name\n", - " )\n", - "\n", - " # Add reranker\n", - " self.reranker = LinearCombinationReranker(weight=0.3)\n", - " self.docsearch = LanceDB.from_documents(\n", - " self.documents, embeddings, reranker=self.reranker\n", - " )\n", - "\n", - " print(\"Embedding stored in lancedb\")\n", - " # Initialize LLM and memory\n", - " self.llm = ChatOpenAI(\n", - " model_name=\"gpt-4o\",\n", - " temperature=0.01,\n", - " )\n", - " self.memory = ConversationBufferMemory(memory_key=\"chat_history\")\n", - "\n", - " def generate_prompt_template(self, main_instructions, prompt_instructions, context_name, query):\n", - " \"\"\"\n", - " Generate a prompt template for LangChain LLM.\n", - "\n", - " Parameters:\n", - " main_instructions (str): Main instructions for the LLM.\n", - " prompt_instructions (str): Additional instructions for how to use the data.\n", - " context_name (str): The name of the context (e.g., search results).\n", - " query (str): The query from the user.\n", - "\n", - " Returns:\n", - " PromptTemplate: The generated prompt template.\n", - " \"\"\"\n", - " template = f\"\"\"{main_instructions}\n", - "\n", - " {prompt_instructions}\n", - "\n", - " {context_name}:\n", - " {{context}}\n", - "\n", - " Previous Conversations:\n", - " {{chat_history}}\n", - " Human: {query}\n", - " Chatbot:\"\"\"\n", - " return PromptTemplate(template=template, 
input_variables=[\"context\", \"chat_history\"])\n", - "\n", - " def get_answer(self, query):\n", - " \"\"\"\n", - " Process a query and return the answer based on the preloaded PDF.\n", - "\n", - " Parameters:\n", - " query (str): The user's query.\n", - "\n", - " Returns:\n", - " str: The answer to the query.\n", - " \"\"\"\n", - " # Perform similarity search\n", - " docs = self.docsearch.similarity_search_with_relevance_scores(query)\n", - "\n", - " # Generate a prompt\n", - " prompt = self.generate_prompt_template(\n", - " main_instructions=\"Act as a knowledgeable assistant. Answer the query comprehensively and concisely based on the provided content.\",\n", - " prompt_instructions=(\n", - " \"Focus on extracting the most relevant and accurate information from the context. \"\n", - " \"Prioritize clarity, conciseness, and detail in your response. \"\n", - " \"When summarizing, ensure key points are highlighted without losing important nuances. \"\n", - " \"If the context is insufficient to fully address the query, acknowledge the limitation clearly.\"\n", - " ),\n", - " context_name=\"PDF Content\",\n", - " query=query,\n", - " )\n", - "\n", - " # Create the LangChain pipeline\n", - " chain = prompt | self.llm | StrOutputParser()\n", - "\n", - " # Invoke the chain and get the answer\n", - " answer = chain.invoke({\"context\": docs, \"chat_history\": self.memory})\n", - " return answer\n", - "\n", - "# Initialize the QueryProcessor with the PDF file (done once)\n", - "file_path = \"/content/FARM5CROPS_farmergpt.pdf\"\n", - "query_processor = QueryProcessor(file_path)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Pzs-a0Kcdokq", - "outputId": "a68fa8d5-edb0-4e7d-964c-4d151ad10a28" - }, - "execution_count": 14, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Embedding stored in lancedb\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "Embedding stored in 
lancedb\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "query = \"give me some sugarcane variety names?\"\n", + "answer = query_processor.get_answer(query)\n", + "print(\"Answer:\", answer)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "dt7rIDwDdonc", + "outputId": "489273aa-b269-4fdf-9cab-cae516f0c766" + }, + "execution_count": 15, + "outputs": [ { - "cell_type": "code", - "source": [ - "query = \"give me some sugarcane variety names?\"\n", - "answer = query_processor.get_answer(query)\n", - "print(\"Answer:\", answer)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "dt7rIDwDdonc", - "outputId": "489273aa-b269-4fdf-9cab-cae516f0c766" - }, - "execution_count": 15, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Answer: The sugarcane varieties are categorized by states in India as follows:\n", - "\n", - "- **Andhra Pradesh:**\n", - " - Early varieties: Co.6907, 84A125, 81A99, 83A30, 85A261, 87A298, Co.8014, 86V96, 91V83.\n", - " - Mid-late varieties: COA7607, CO8021, COT.8201, Co7805, COV92102 (83V15), 83V288.\n", - " - Late varieties: Co.7219, CoR8001, 87A380, Co7706.\n", - "\n", - "- **Bihar:**\n", - " - Varieties: Bo 99, CoP 9301, CoSe 98231, CoS 8436, Cos 95255, Bo 102, Bo 91, Bo 110, CoP 9206, CoSe 95422, CoSe 92423, UP 9530.\n", - "\n", - "- **Gujarat:**\n", - " - Varieties: Co 86002, Co 86032, CoSi 95071, Co 86249, CoN 05072.\n", - "\n", - "- **Haryana:**\n", - " - Varieties: CoJ 64, CoS 8436, CoS 88230, CoS 767.\n", - "\n", - "- **Karnataka:**\n", - " - Varieties: Co 94012, CoC 671, Co 92020, Co 8014, Co 86032, Co 62175, Co 8371, Co 740, Co 8011.\n", - "\n", - "- **Maharashtra:**\n", - " - Varieties: CoC 671, Co 86032, Co 8011, Co 94012, CoM 265, Co 92005.\n", - "\n", - "- **Odisha:**\n", - " - Varieties: Co 62175, CoA 89085, Co 87A298, Co86V96.\n", - "\n", - "- **Punjab:**\n", - " - Varieties: CoJ 85, CoJ 88, CoS8436, CoH 119, 
Co89003.\n", - "\n", - "- **Tamil Nadu:**\n", - " - Varieties: Co 94012, Co 94010, CoC 24.\n", - "\n", - "- **Uttar Pradesh:**\n", - " - Varieties: CoS 8436, Coj 64, CoS88230, CoS 98231, CoS 767, CoS 8432, CoPt 90223, CoS 92423, CoS97264, CoLk 8102.\n", - "\n", - "- **Uttrakhand:**\n", - " - Varieties: CoS 8436, CoS 88230, Cos 767, CoS 97264, CoSe 92423.\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "Answer: The sugarcane varieties are categorized by states in India as follows:\n", + "\n", + "- **Andhra Pradesh:**\n", + " - Early varieties: Co.6907, 84A125, 81A99, 83A30, 85A261, 87A298, Co.8014, 86V96, 91V83.\n", + " - Mid-late varieties: COA7607, CO8021, COT.8201, Co7805, COV92102 (83V15), 83V288.\n", + " - Late varieties: Co.7219, CoR8001, 87A380, Co7706.\n", + "\n", + "- **Bihar:**\n", + " - Varieties: Bo 99, CoP 9301, CoSe 98231, CoS 8436, Cos 95255, Bo 102, Bo 91, Bo 110, CoP 9206, CoSe 95422, CoSe 92423, UP 9530.\n", + "\n", + "- **Gujarat:**\n", + " - Varieties: Co 86002, Co 86032, CoSi 95071, Co 86249, CoN 05072.\n", + "\n", + "- **Haryana:**\n", + " - Varieties: CoJ 64, CoS 8436, CoS 88230, CoS 767.\n", + "\n", + "- **Karnataka:**\n", + " - Varieties: Co 94012, CoC 671, Co 92020, Co 8014, Co 86032, Co 62175, Co 8371, Co 740, Co 8011.\n", + "\n", + "- **Maharashtra:**\n", + " - Varieties: CoC 671, Co 86032, Co 8011, Co 94012, CoM 265, Co 92005.\n", + "\n", + "- **Odisha:**\n", + " - Varieties: Co 62175, CoA 89085, Co 87A298, Co86V96.\n", + "\n", + "- **Punjab:**\n", + " - Varieties: CoJ 85, CoJ 88, CoS8436, CoH 119, Co89003.\n", + "\n", + "- **Tamil Nadu:**\n", + " - Varieties: Co 94012, Co 94010, CoC 24.\n", + "\n", + "- **Uttar Pradesh:**\n", + " - Varieties: CoS 8436, Coj 64, CoS88230, CoS 98231, CoS 767, CoS 8432, CoPt 90223, CoS 92423, CoS97264, CoLk 8102.\n", + "\n", + "- **Uttrakhand:**\n", + " - Varieties: CoS 8436, CoS 88230, Cos 767, CoS 97264, CoSe 92423.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + 
"# Reuse with another query\n", + "query2 = \"What crops grow in dry regions?\"\n", + "answer2 = query_processor.get_answer(query2)\n", + "print(\"Answer:\", answer2)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "lGUJMdVodorC", + "outputId": "4909b2ff-e97a-45be-83f3-71894e2a9162" + }, + "execution_count": 16, + "outputs": [ { - "cell_type": "code", - "source": [ - "# Reuse with another query\n", - "query2 = \"What crops grow in dry regions?\"\n", - "answer2 = query_processor.get_answer(query2)\n", - "print(\"Answer:\", answer2)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "lGUJMdVodorC", - "outputId": "4909b2ff-e97a-45be-83f3-71894e2a9162" - }, - "execution_count": 16, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Answer: In dry regions, crops that are typically grown include those that are drought-resistant and can thrive with minimal water. Some common crops suitable for dry regions are:\n", - "\n", - "1. **Millets**: These are hardy grains that require less water and can grow in poor soil conditions. Examples include pearl millet (bajra) and finger millet (ragi).\n", - "\n", - "2. **Sorghum**: Known for its drought tolerance, sorghum is a staple in many dry areas.\n", - "\n", - "3. **Pulses**: Legumes such as chickpeas, lentils, and pigeon peas are often grown in dry regions due to their ability to fix nitrogen and improve soil fertility.\n", - "\n", - "4. **Oilseeds**: Crops like sesame and mustard are suitable for dry climates.\n", - "\n", - "5. **Cotton**: This crop can be grown in semi-arid regions with proper irrigation management.\n", - "\n", - "6. 
**Cactus and Succulents**: While not traditional crops, these plants are increasingly being explored for their potential in arid agriculture.\n", - "\n", - "These crops are chosen for their ability to withstand water scarcity and their adaptability to the challenging conditions of dry regions.\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "Answer: In dry regions, crops that are typically grown include those that are drought-resistant and can thrive with minimal water. Some common crops suitable for dry regions are:\n", + "\n", + "1. **Millets**: These are hardy grains that require less water and can grow in poor soil conditions. Examples include pearl millet (bajra) and finger millet (ragi).\n", + "\n", + "2. **Sorghum**: Known for its drought tolerance, sorghum is a staple in many dry areas.\n", + "\n", + "3. **Pulses**: Legumes such as chickpeas, lentils, and pigeon peas are often grown in dry regions due to their ability to fix nitrogen and improve soil fertility.\n", + "\n", + "4. **Oilseeds**: Crops like sesame and mustard are suitable for dry climates.\n", + "\n", + "5. **Cotton**: This crop can be grown in semi-arid regions with proper irrigation management.\n", + "\n", + "6. 
**Cactus and Succulents**: While not traditional crops, these plants are increasingly being explored for their potential in arid agriculture.\n", + "\n", + "These crops are chosen for their ability to withstand water scarcity and their adaptability to the challenging conditions of dry regions.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# General template for your chat with pdf\n", + "##### change the promts as per requirements" + ], + "metadata": { + "id": "1dpAeJtLnL4D" + } + }, + { + "cell_type": "code", + "source": [ + "# gradio app\n", + "\n", + "import gradio as gr\n", + "from lancedb.rerankers import LinearCombinationReranker\n", + "from langchain_community.document_loaders import PyPDFLoader\n", + "from langchain_community.vectorstores import LanceDB\n", + "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "from langchain_text_splitters import CharacterTextSplitter\n", + "from langchain_core.prompts import PromptTemplate\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain.memory import ConversationBufferMemory\n", + "\n", + "\n", + "class QueryProcessor:\n", + " def __init__(self, file_path, db_url=\"lancedb_temp\", table_name=\"lancedb_indic\"):\n", + " loader = PyPDFLoader(file_path)\n", + " documents = loader.load()\n", + " text_splitter = CharacterTextSplitter()\n", + " self.documents = text_splitter.split_documents(documents)\n", + "\n", + " embeddings = OpenAIEmbeddings()\n", + " self.vector_store = LanceDB(\n", + " uri=db_url, embedding=embeddings, table_name=table_name\n", + " )\n", + "\n", + " self.reranker = LinearCombinationReranker(weight=0.3)\n", + " self.docsearch = LanceDB.from_documents(\n", + " self.documents, embeddings, reranker=self.reranker\n", + " )\n", + "\n", + " self.llm = ChatOpenAI(\n", + " model_name=\"gpt-4o\",\n", + " temperature=0.01,\n", + " )\n", + " self.memory = ConversationBufferMemory(memory_key=\"chat_history\")\n", + "\n", + " def 
generate_prompt_template(\n", + " self, main_instructions, prompt_instructions, context_name, query\n", + " ):\n", + " template = f\"\"\"{main_instructions}\n", + "\n", + " {prompt_instructions}\n", + "\n", + " {context_name}:\n", + " {{context}}\n", + "\n", + " Previous Conversations:\n", + " {{chat_history}}\n", + " Human: {query}\n", + " Chatbot:\"\"\"\n", + " return PromptTemplate(\n", + " template=template, input_variables=[\"context\", \"chat_history\"]\n", + " )\n", + "\n", + " def get_answer(self, query):\n", + " docs = self.docsearch.similarity_search_with_relevance_scores(query)\n", + "\n", + " prompt = self.generate_prompt_template(\n", + " main_instructions=\"Act as a knowledgeable assistant. Answer the query comprehensively and concisely based on the provided content.\",\n", + " prompt_instructions=(\n", + " \"Focus on extracting the most relevant and accurate information from the context. \"\n", + " \"Prioritize clarity, conciseness, and detail in your response. \"\n", + " \"When summarizing, ensure key points are highlighted without losing important nuances. \"\n", + " \"If the context is insufficient to fully address the query, acknowledge the limitation clearly.\"\n", + " ),\n", + " context_name=\"Search Results\",\n", + " query=query,\n", + " )\n", + "\n", + " chain = prompt | self.llm | StrOutputParser()\n", + "\n", + " answer = chain.invoke({\"context\": docs, \"chat_history\": self.memory})\n", + " return answer\n", + "\n", + "\n", + "def initialize_processor(pdf_file):\n", + " global query_processor\n", + " query_processor = QueryProcessor(pdf_file.name)\n", + " return \"PDF successfully loaded and processed. 
You can now ask questions.\"\n", + "\n", + "\n", + "def query_processor_fn(question):\n", + " global query_processor\n", + " if query_processor is None:\n", + " return \"Please upload a PDF first.\"\n", + " return query_processor.get_answer(question)\n", + "\n", + "\n", + "query_processor = None\n", + "\n", + "# Define Gradio interface\n", + "with gr.Blocks() as app:\n", + " gr.Markdown(\"# RAG On PDF - FarmersGPT\")\n", + "\n", + " with gr.Row():\n", + " pdf_upload = gr.File(label=\"Upload PDF\", file_types=[\".pdf\"])\n", + " pdf_status = gr.Textbox(label=\"Status\", interactive=False)\n", + "\n", + " load_pdf_btn = gr.Button(\"Load PDF\")\n", + "\n", + " with gr.Row():\n", + " user_query = gr.Textbox(\n", + " label=\"Ask a question\", placeholder=\"Enter your question here...\"\n", + " )\n", + " answer_box = gr.Textbox(label=\"Answer\", interactive=False)\n", + "\n", + " ask_question_btn = gr.Button(\"Get Answer\")\n", + "\n", + " load_pdf_btn.click(initialize_processor, inputs=[pdf_upload], outputs=[pdf_status])\n", + " ask_question_btn.click(\n", + " query_processor_fn, inputs=[user_query], outputs=[answer_box]\n", + " )\n", + "\n", + "app.launch(debug=True, share=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 611 }, + "id": "RVc-wh3ld_Ga", + "outputId": "05b0d7de-e095-4d19-a836-63edfa715af6" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "# General template for your chat with pdf\n", - "##### change the promts as per requirements" - ], - "metadata": { - "id": "1dpAeJtLnL4D" - } + "output_type": "stream", + "name": "stdout", + "text": [ + "Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().\n", + "* Running on public URL: https://6e972183d1c4f70073.gradio.live\n", + "\n", + "This share link expires in 72 hours. 
For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n" + ] }, { - "cell_type": "code", - "source": [ - "#gradio app\n", - "\n", - "import gradio as gr\n", - "from lancedb.rerankers import LinearCombinationReranker\n", - "from langchain_community.document_loaders import PyPDFLoader\n", - "from langchain_community.vectorstores import LanceDB\n", - "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", - "from langchain_text_splitters import CharacterTextSplitter\n", - "from langchain_core.prompts import PromptTemplate\n", - "from langchain_core.output_parsers import StrOutputParser\n", - "from langchain.memory import ConversationBufferMemory\n", - "\n", - "class QueryProcessor:\n", - " def __init__(self, file_path, db_url=\"lancedb_temp\", table_name=\"lancedb_indic\"):\n", - " loader = PyPDFLoader(file_path)\n", - " documents = loader.load()\n", - " text_splitter = CharacterTextSplitter()\n", - " self.documents = text_splitter.split_documents(documents)\n", - "\n", - " embeddings = OpenAIEmbeddings()\n", - " self.vector_store = LanceDB(\n", - " uri=db_url,\n", - " embedding=embeddings,\n", - " table_name=table_name\n", - " )\n", - "\n", - " self.reranker = LinearCombinationReranker(weight=0.3)\n", - " self.docsearch = LanceDB.from_documents(\n", - " self.documents, embeddings, reranker=self.reranker\n", - " )\n", - "\n", - " self.llm = ChatOpenAI(\n", - " model_name=\"gpt-4o\",\n", - " temperature=0.01,\n", - " )\n", - " self.memory = ConversationBufferMemory(memory_key=\"chat_history\")\n", - "\n", - " def generate_prompt_template(self, main_instructions, prompt_instructions, context_name, query):\n", - " template = f\"\"\"{main_instructions}\n", - "\n", - " {prompt_instructions}\n", - "\n", - " {context_name}:\n", - " {{context}}\n", - "\n", - " Previous Conversations:\n", - " {{chat_history}}\n", - " Human: {query}\n", - " 
Chatbot:\"\"\"\n", - " return PromptTemplate(template=template, input_variables=[\"context\", \"chat_history\"])\n", - "\n", - " def get_answer(self, query):\n", - " docs = self.docsearch.similarity_search_with_relevance_scores(query)\n", - "\n", - " prompt = self.generate_prompt_template(\n", - " main_instructions=\"Act as a knowledgeable assistant. Answer the query comprehensively and concisely based on the provided content.\",\n", - " prompt_instructions=(\n", - " \"Focus on extracting the most relevant and accurate information from the context. \"\n", - " \"Prioritize clarity, conciseness, and detail in your response. \"\n", - " \"When summarizing, ensure key points are highlighted without losing important nuances. \"\n", - " \"If the context is insufficient to fully address the query, acknowledge the limitation clearly.\"\n", - " ),\n", - " context_name=\"Search Results\",\n", - " query=query,\n", - " )\n", - "\n", - " chain = prompt | self.llm | StrOutputParser()\n", - "\n", - " answer = chain.invoke({\"context\": docs, \"chat_history\": self.memory})\n", - " return answer\n", - "\n", - "def initialize_processor(pdf_file):\n", - " global query_processor\n", - " query_processor = QueryProcessor(pdf_file.name)\n", - " return \"PDF successfully loaded and processed. 
You can now ask questions.\"\n", - "\n", - "def query_processor_fn(question):\n", - " global query_processor\n", - " if query_processor is None:\n", - " return \"Please upload a PDF first.\"\n", - " return query_processor.get_answer(question)\n", - "\n", - "query_processor = None\n", - "\n", - "# Define Gradio interface\n", - "with gr.Blocks() as app:\n", - " gr.Markdown(\"# RAG On PDF - FarmersGPT\")\n", - "\n", - " with gr.Row():\n", - " pdf_upload = gr.File(label=\"Upload PDF\", file_types=[\".pdf\"])\n", - " pdf_status = gr.Textbox(label=\"Status\", interactive=False)\n", - "\n", - " load_pdf_btn = gr.Button(\"Load PDF\")\n", - "\n", - " with gr.Row():\n", - " user_query = gr.Textbox(label=\"Ask a question\", placeholder=\"Enter your question here...\")\n", - " answer_box = gr.Textbox(label=\"Answer\", interactive=False)\n", - "\n", - " ask_question_btn = gr.Button(\"Get Answer\")\n", - "\n", - " load_pdf_btn.click(initialize_processor, inputs=[pdf_upload], outputs=[pdf_status])\n", - " ask_question_btn.click(query_processor_fn, inputs=[user_query], outputs=[answer_box])\n", - "\n", - "app.launch(debug=True,share=True)\n" + "output_type": "display_data", + "data": { + "text/plain": [ + "" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 611 - }, - "id": "RVc-wh3ld_Ga", - "outputId": "05b0d7de-e095-4d19-a836-63edfa715af6" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().\n", - "* Running on public URL: https://6e972183d1c4f70073.gradio.live\n", - "\n", - "This share link expires in 72 hours. 
For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "
" - ] - }, - "metadata": {} - } + "text/html": [ + "
" ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "lWPuyELzxiRq" - }, - "execution_count": null, - "outputs": [] + }, + "metadata": {} } - ] + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "lWPuyELzxiRq" + }, + "execution_count": null, + "outputs": [] + } + ] } diff --git a/examples/congee-RAG/README.md b/examples/congee-RAG/README.md new file mode 100644 index 0000000..1bb7fa9 --- /dev/null +++ b/examples/congee-RAG/README.md @@ -0,0 +1,86 @@ +# Cognee - Get Started + +## Let's talk about the problem first + +### Large Language Models (LLMs) have become powerful tools for generating text and answering questions, but they still have several limitations and challenges. Below is an overview of some of the biggest problems with the results they produce: + +### 1. Hallucinations and Misinformation +- Hallucinations: LLMs sometimes produce outputs that are factually incorrect or entirely fabricated. This phenomenon is known as "hallucination." Even if an LLM seems confident, the information it provides might not be reliable. +- Misinformation: Misinformation can be subtle or glaring, ranging from minor inaccuracies to entirely fictitious events, sources, or data. + +### 2. Lack of Contextual Understanding +- LLMs can recognize and replicate patterns in language but don’t have true comprehension. This can lead to responses that are coherent but miss nuanced context or deeper meaning. +- They can misinterpret multi-turn conversations, leading to confusion in maintaining context over a long dialogue. + +### 3. Inconsistent Reliability +- Depending on the prompt, LLMs might produce inconsistent responses to similar questions or tasks. For example, the same query might result in conflicting answers when asked in slightly different ways. +- This inconsistency can undermine trust in the model's outputs, especially in professional or academic settings. + +### 4. 
Inability to Access Real-Time Information +- Most LLMs are trained on data up to a specific point and cannot access or generate information on current events or emerging trends unless updated. This can make them unsuitable for inquiries requiring up-to-date information. +- Real-time browsing capabilities can help, but they are not universally available. + +### 5. Lack of Personalization and Adaptability +- LLMs do not naturally adapt to individual preferences or learning styles unless explicitly programmed to do so. This limits their usefulness in providing personalized recommendations or support. + +### 6. Difficulty with Highly Technical or Niche Domains +- LLMs may struggle with highly specialized or technical topics where domain-specific knowledge is required. +- They can produce technically plausible but inaccurate or incomplete information, which can be misleading in areas like law, medicine, or scientific research. + +### 7. Ambiguity in Response Generation +- LLMs might not always specify their level of certainty, making it hard to gauge when they are speculating or providing less confident answers. +- They lack a mechanism to say “I don’t know,” which can lead to responses that are less useful or potentially misleading. + +## The next solution was RAGs + +RAGs (Retrieval Augmented Generation) are systems that connect to a vector store and search for similar data so they can enrich the LLM's response. + +![rag.png]() + + +The problem lies in the nature of the search. If you just match some keywords and return one or many documents from the vector store this way, you will have an issue with the way you organise and prioritise those documents. + + +![rag_problem_v2_white.drawio.png]() + + +## Semantic similarity search is not magic +The most similar result isn't always the most relevant one. 
+If you search for documents in which the sentiment expressed is "I like apples.", one of the closest results you get are documents in which the sentiment expressed is "I don't like apples." +Wouldn't it be nice to have a semantic model LLMs could use? + +## That is where Cognee comes in +Cognee assists developers in introducing greater predictability and management into their Retrieval-Augmented Generation (RAG) workflows through the use of graph architectures, vector stores, and auto-optimizing pipelines. Displaying information as a graph is the clearest way to grasp the content of your documents. Crucially, graphs allow systematic navigation and extraction of data from documents based on their hierarchy. + +Cognee lets you create tasks and contextual pipelines of tasks that enable composable GraphRAG, where you have full control of all the elements of the pipeline from ingestion until graph creation. + + +## Core Concepts +Most of the data we provide to a system can be categorized as unstructured, semi-structured, or structured. Rows from a database would belong to structured data, jsons to semi-structured data, and logs that we input into the system could be considered unstructured. To organize and process this data, we need to ensure we have custom loaders for all data types, which can help us unify and organize it properly. + +![image.png]() + + +In the example above, we have a pipeline in which data has been imported from various sources, normalized, and stored in a database. + +## Concept 2: Data Enrichment with LLMs +LLMs are adept at processing unstructured data. They can easily extract summaries, keywords, and other useful information from documents. We use function calling with Pydantic models to extract information from the unstructured data. + +![image.png]() + + +## Concept 3: Graphs +Knowledge graphs simply map out knowledge, linking specific facts and their connections. 
When Large Language Models (LLMs) process text, they infer these links, leading to occasional inaccuracies due to their probabilistic nature. Clearly defined relationships enhance their accuracy. This structured approach can extend beyond concepts to document layouts, pages, or other organizational schemas. + +![Untitled-2024-10-08-1656(2).png]() + +## Concept 4: Vector and Graph Retrieval +Cognee lets you use multiple vector and graph retrieval methods to find the most relevant information. + +## Concept 5: Auto-Optimizing Pipelines +Integrating knowledge graphs into Retrieval-Augmented Generation (RAG) pipelines leads to an intriguing outcome: the system's adeptness at contextual understanding allows it to be evaluated in a way Machine Learning (ML) engineers are accustomed to. This involves bombarding the RAG system with hundreds of synthetic questions, enabling the knowledge graph to evolve and refine its context autonomously over time. This method paves the way for developing self-improving memory engines that can adapt to new data and user feedback. 
+ + +## Below is a diagram of the cognee process for the data used in this example + +![cognee_final.drawio.png]() \ No newline at end of file diff --git a/examples/congee-RAG/cognee_demo.ipynb b/examples/congee-RAG/cognee_demo.ipynb new file mode 100644 index 0000000..01fb7d6 --- /dev/null +++ b/examples/congee-RAG/cognee_demo.ipynb @@ -0,0 +1,1045 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d35ac8ce-0f92-46f5-9ba4-a46970f0ce19", + "metadata": { + "id": "d35ac8ce-0f92-46f5-9ba4-a46970f0ce19" + }, + "source": [ + "# Cognee - Get Started" + ] + }, + { + "cell_type": "markdown", + "id": "bd981778-0c84-4542-8e6f-1a7712184873", + "metadata": { + "editable": true, + "tags": [], + "id": "bd981778-0c84-4542-8e6f-1a7712184873" + }, + "source": [ + "## Let's talk about the problem first\n", + "\n", + "### Large Language Models (LLMs) have become powerful tools for generating text and answering questions, but they still have several limitations and challenges. Below is an overview of some of the biggest problems with the results they produce:\n", + "\n", + "### 1. Hallucinations and Misinformation\n", + "- Hallucinations: LLMs sometimes produce outputs that are factually incorrect or entirely fabricated. This phenomenon is known as \"hallucination.\" Even if an LLM seems confident, the information it provides might not be reliable.\n", + "- Misinformation: Misinformation can be subtle or glaring, ranging from minor inaccuracies to entirely fictitious events, sources, or data.\n", + "\n", + "### 2. Lack of Contextual Understanding\n", + "- LLMs can recognize and replicate patterns in language but don’t have true comprehension. This can lead to responses that are coherent but miss nuanced context or deeper meaning.\n", + "- They can misinterpret multi-turn conversations, leading to confusion in maintaining context over a long dialogue.\n", + "\n", + "### 3. 
Inconsistent Reliability\n", + "- Depending on the prompt, LLMs might produce inconsistent responses to similar questions or tasks. For example, the same query might result in conflicting answers when asked in slightly different ways.\n", + "- This inconsistency can undermine trust in the model's outputs, especially in professional or academic settings.\n", + "\n", + "### 4. Inability to Access Real-Time Information\n", + "- Most LLMs are trained on data up to a specific point and cannot access or generate information on current events or emerging trends unless updated. This can make them unsuitable for inquiries requiring up-to-date information.\n", + "- Real-time browsing capabilities can help, but they are not universally available.\n", + "\n", + "### 5. Lack of Personalization and Adaptability\n", + "- LLMs do not naturally adapt to individual preferences or learning styles unless explicitly programmed to do so. This limits their usefulness in providing personalized recommendations or support.\n", + "\n", + "### 6. Difficulty with Highly Technical or Niche Domains\n", + "- LLMs may struggle with highly specialized or technical topics where domain-specific knowledge is required.\n", + "- They can produce technically plausible but inaccurate or incomplete information, which can be misleading in areas like law, medicine, or scientific research.\n", + "\n", + "### 7. Ambiguity in Response Generation\n", + "- LLMs might not always specify their level of certainty, making it hard to gauge when they are speculating or providing less confident answers.\n", + "- They lack a mechanism to say “I don’t know,” which can lead to responses that are less useful or potentially misleading." 
+ ] + }, + { + "cell_type": "markdown", + "id": "d8e606b1-94d3-43ce-bb4b-dbadff7f4ca6", + "metadata": { + "id": "d8e606b1-94d3-43ce-bb4b-dbadff7f4ca6" + }, + "source": [ + "## The next solution was RAGs\n", + "\n", + "#### RAGs (Retrieval Augmented Generation) are systems that connect to a vector store and search for similar data so they can enrich LLM response." + ] + }, + { + "cell_type": "markdown", + "id": "23e74f22-f43c-4f03-afe0-b423cbaa412a", + "metadata": { + "id": "23e74f22-f43c-4f03-afe0-b423cbaa412a" + }, + "source": [ + "![rag.png]()" + ] + }, + { + "cell_type": "markdown", + "id": "b6a98710-a14b-4a14-bb56-d3ae055e94d9", + "metadata": { + "id": "b6a98710-a14b-4a14-bb56-d3ae055e94d9" + }, + "source": [ + "#### The problem lies in the nature of the search. If you just find some keywords, and return one or many documents from vectorstore this way, you will have an issue with the way you would use to organise and prioritise documents.\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "![rag_problem_v2_white.drawio.png]()" + ], + "metadata": { + "id": "RQfw9o-Ege5S" + }, + "id": "RQfw9o-Ege5S" + }, + { + "cell_type": "markdown", + "id": "d3406b8f-cb9a-46ce-9029-2cbae2755795", + "metadata": { + "id": "d3406b8f-cb9a-46ce-9029-2cbae2755795" + }, + "source": [ + "## Semantic similarity search is not magic\n", + "#### The most similar result isn't the most relevant one.\n", + "#### If you search for documents in which the sentiment expressed is \"I like apples.\", one of the closest results you get are documents in which the sentiment expressed is \"I don't like apples.\"\n", + "#### Wouldn't it be nice to have a semantic model LLMs could use?\n" + ] + }, + { + "cell_type": "markdown", + "id": "b900f830-8e9e-4272-b198-594606da4457", + "metadata": { + "id": "b900f830-8e9e-4272-b198-594606da4457" + }, + "source": [ + "# That is where Cognee comes in" + ] + }, + { + "cell_type": "markdown", + "id": "d3ae099a-1bbb-4f13-9bcb-c0f778d50e91", + "metadata": {
+ "id": "d3ae099a-1bbb-4f13-9bcb-c0f778d50e91" + }, + "source": [ + "#### Cognee assists developers in introducing greater predictability and management into their Retrieval-Augmented Generation (RAG) workflows through the use of graph architectures, vector stores, and auto-optimizing pipelines. Displaying information as a graph is the clearest way to grasp the content of your documents. Crucially, graphs allow systematic navigation and extraction of data from documents based on their hierarchy.\n", + "\n", + "#### Cognee lets you create tasks and contextual pipelines of tasks that enable composable GraphRAG, where you have full control of all the elements of the pipeline from ingestion until graph creation." + ] + }, + { + "cell_type": "markdown", + "id": "785383b0-87b5-4a0a-be3f-e809aa284e30", + "metadata": { + "id": "785383b0-87b5-4a0a-be3f-e809aa284e30" + }, + "source": [ + "# Core Concepts" + ] + }, + { + "cell_type": "markdown", + "id": "3540ce30-2b22-4ece-8516-8d5ff2a405fe", + "metadata": { + "id": "3540ce30-2b22-4ece-8516-8d5ff2a405fe" + }, + "source": [ + "### Most of the data we provide to a system can be categorized as unstructured, semi-structured, or structured. Rows from a database would belong to structured data, jsons to semi-structured data, and logs that we input into the system could be considered unstructured. To organize and process this data, we need to ensure we have custom loaders for all data types, which can help us unify and organize it properly." + ] + }, + { + "cell_type": "markdown", + "id": "fe0bfa57-dca7-40aa-9ead-c6852b155878", + "metadata": { + "id": "fe0bfa57-dca7-40aa-9ead-c6852b155878" + }, + "source": [ + "![image.png]()" + ] + }, + { + "cell_type": "markdown", + "id": "7e47bae4-d27d-4430-a134-e1b381378f5c", + "metadata": { + "id": "7e47bae4-d27d-4430-a134-e1b381378f5c" + }, + "source": [ + "#### In the example above, we have a pipeline in which data has been imported from various sources, normalized, and stored in a database." 
+ ] + }, + { + "cell_type": "markdown", + "id": "2f9c9376-8c68-4397-9081-d260cddcbd25", + "metadata": { + "id": "2f9c9376-8c68-4397-9081-d260cddcbd25" + }, + "source": [ + "## Concept 2: Data Enrichment with LLMs" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#### LLMs are adept at processing unstructured data. They can easily extract summaries, keywords, and other useful information from documents. We use function calling with Pydantic models to extract information from the unstructured data." + ], + "metadata": { + "id": "oFMidcsB7ap6" + }, + "id": "oFMidcsB7ap6" + }, + { + "cell_type": "markdown", + "source": [ + "![image.png]()" + ], + "metadata": { + "id": "WXP8KevM7dRT" + }, + "id": "WXP8KevM7dRT" + }, + { + "cell_type": "markdown", + "source": [ + "#### We decompose the loaded content into graphs, allowing us to more precisely map out the relationships between entities and concepts." + ], + "metadata": { + "id": "A9PMdOc37rbo" + }, + "id": "A9PMdOc37rbo" + }, + { + "cell_type": "markdown", + "source": [ + "## Concept 3: Graphs" + ], + "metadata": { + "id": "VLDoIXqI7uOD" + }, + "id": "VLDoIXqI7uOD" + }, + { + "cell_type": "markdown", + "source": [ + "#### Knowledge graphs simply map out knowledge, linking specific facts and their connections. When Large Language Models (LLMs) process text, they infer these links, leading to occasional inaccuracies due to their probabilistic nature. Clearly defined relationships enhance their accuracy. This structured approach can extend beyond concepts to document layouts, pages, or other organizational schemas." 
+ ], + "metadata": { + "id": "t1yh531L7vve" + }, + "id": "t1yh531L7vve" + }, + { + "cell_type": "markdown", + "source": [ + "![Untitled-2024-10-08-1656(2).png]()" + ], + "metadata": { + "id": "AArlpK0S7x6X" + }, + "id": "AArlpK0S7x6X" + }, + { + "cell_type": "markdown", + "source": [ + "## Concept 4: Vector and Graph Retrieval" + ], + "metadata": { + "id": "XJ-gpI6f76CD" + }, + "id": "XJ-gpI6f76CD" + }, + { + "cell_type": "markdown", + "source": [ + "#### Cognee lets you use multiple vector and graph retrieval methods to find the most relevant information." + ], + "metadata": { + "id": "tJz0QrQe7-hF" + }, + "id": "tJz0QrQe7-hF" + }, + { + "cell_type": "markdown", + "source": [ + "## Concept 5: Auto-Optimizing Pipelines" + ], + "metadata": { + "id": "BKlaAVQx8AyK" + }, + "id": "BKlaAVQx8AyK" + }, + { + "cell_type": "markdown", + "source": [ + "#### Integrating knowledge graphs into Retrieval-Augmented Generation (RAG) pipelines leads to an intriguing outcome: the system's adeptness at contextual understanding allows it to be evaluated in a way Machine Learning (ML) engineers are accustomed to. This involves bombarding the RAG system with hundreds of synthetic questions, enabling the knowledge graph to evolve and refine its context autonomously over time. This method paves the way for developing self-improving memory engines that can adapt to new data and user feedback." 
+ ], + "metadata": { + "id": "3h0kmuL88CU4" + }, + "id": "3h0kmuL88CU4" + }, + { + "cell_type": "markdown", + "id": "074f0ea8-c659-4736-be26-be4b0e5ac665", + "metadata": { + "id": "074f0ea8-c659-4736-be26-be4b0e5ac665" + }, + "source": [ + "# Demo time" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#### First we need to install all dependencies:" + ], + "metadata": { + "id": "hVVgc9KZmk3v" + }, + "id": "hVVgc9KZmk3v" + }, + { + "cell_type": "code", + "source": [ + "!pip install onnxruntime-gpu==1.17.1 --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ -qqq" + ], + "metadata": { + "id": "7ytkuIkFmeiE" + }, + "id": "7ytkuIkFmeiE", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install cognee==0.1.18" + ], + "metadata": { + "id": "cVPQTKcWmgJ0" + }, + "id": "cVPQTKcWmgJ0", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "id": "0587d91d", + "metadata": { + "id": "0587d91d" + }, + "source": [ + "#### Then let's define some data that we will cognify and perform a search on" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df16431d0f48b006", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:48.519686Z", + "start_time": "2024-09-20T14:02:48.515589Z" + }, + "id": "df16431d0f48b006" + }, + "outputs": [], + "source": [ + "job_position = \"\"\"Senior Data Scientist (Machine Learning)\n", + "\n", + "Company: TechNova Solutions\n", + "Location: San Francisco, CA\n", + "\n", + "Job Description:\n", + "\n", + "TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. 
The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.\n", + "\n", + "Responsibilities:\n", + "\n", + "Develop and implement advanced machine learning algorithms and models.\n", + "Analyze large, complex datasets to extract meaningful patterns and insights.\n", + "Collaborate with cross-functional teams to integrate predictive models into products.\n", + "Stay updated with the latest advancements in machine learning and data science.\n", + "Mentor junior data scientists and provide technical guidance.\n", + "Qualifications:\n", + "\n", + "Master’s or Ph.D. in Data Science, Computer Science, Statistics, or a related field.\n", + "5+ years of experience in data science and machine learning.\n", + "Proficient in Python, R, and SQL.\n", + "Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).\n", + "Strong problem-solving skills and attention to detail.\n", + "Candidate CVs\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9086abf3af077ab4", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:49.120838Z", + "start_time": "2024-09-20T14:02:49.118294Z" + }, + "id": "9086abf3af077ab4" + }, + "outputs": [], + "source": [ + "job_1 = \"\"\"\n", + "CV 1: Relevant\n", + "Name: Dr. Emily Carter\n", + "Contact Information:\n", + "\n", + "Email: emily.carter@example.com\n", + "Phone: (555) 123-4567\n", + "Summary:\n", + "\n", + "Senior Data Scientist with over 8 years of experience in machine learning and predictive analytics. Expertise in developing advanced algorithms and deploying scalable models in production environments.\n", + "\n", + "Education:\n", + "\n", + "Ph.D. in Computer Science, Stanford University (2014)\n", + "B.S. 
in Mathematics, University of California, Berkeley (2010)\n", + "Experience:\n", + "\n", + "Senior Data Scientist, InnovateAI Labs (2016 – Present)\n", + "Led a team in developing machine learning models for natural language processing applications.\n", + "Implemented deep learning algorithms that improved prediction accuracy by 25%.\n", + "Collaborated with cross-functional teams to integrate models into cloud-based platforms.\n", + "Data Scientist, DataWave Analytics (2014 – 2016)\n", + "Developed predictive models for customer segmentation and churn analysis.\n", + "Analyzed large datasets using Hadoop and Spark frameworks.\n", + "Skills:\n", + "\n", + "Programming Languages: Python, R, SQL\n", + "Machine Learning: TensorFlow, Keras, Scikit-Learn\n", + "Big Data Technologies: Hadoop, Spark\n", + "Data Visualization: Tableau, Matplotlib\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9de0cc07f798b7f", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:49.675003Z", + "start_time": "2024-09-20T14:02:49.671615Z" + }, + "id": "a9de0cc07f798b7f" + }, + "outputs": [], + "source": [ + "job_2 = \"\"\"\n", + "CV 2: Relevant\n", + "Name: Michael Rodriguez\n", + "Contact Information:\n", + "\n", + "Email: michael.rodriguez@example.com\n", + "Phone: (555) 234-5678\n", + "Summary:\n", + "\n", + "Data Scientist with a strong background in machine learning and statistical modeling. Skilled in handling large datasets and translating data into actionable business insights.\n", + "\n", + "Education:\n", + "\n", + "M.S. in Data Science, Carnegie Mellon University (2013)\n", + "B.S. 
in Computer Science, University of Michigan (2011)\n", + "Experience:\n", + "\n", + "Senior Data Scientist, Alpha Analytics (2017 – Present)\n", + "Developed machine learning models to optimize marketing strategies.\n", + "Reduced customer acquisition cost by 15% through predictive modeling.\n", + "Data Scientist, TechInsights (2013 – 2017)\n", + "Analyzed user behavior data to improve product features.\n", + "Implemented A/B testing frameworks to evaluate product changes.\n", + "Skills:\n", + "\n", + "Programming Languages: Python, Java, SQL\n", + "Machine Learning: Scikit-Learn, XGBoost\n", + "Data Visualization: Seaborn, Plotly\n", + "Databases: MySQL, MongoDB\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "185ff1c102d06111", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:50.286828Z", + "start_time": "2024-09-20T14:02:50.284369Z" + }, + "id": "185ff1c102d06111" + }, + "outputs": [], + "source": [ + "job_3 = \"\"\"\n", + "CV 3: Relevant\n", + "Name: Sarah Nguyen\n", + "Contact Information:\n", + "\n", + "Email: sarah.nguyen@example.com\n", + "Phone: (555) 345-6789\n", + "Summary:\n", + "\n", + "Data Scientist specializing in machine learning with 6 years of experience. Passionate about leveraging data to drive business solutions and improve product performance.\n", + "\n", + "Education:\n", + "\n", + "M.S. in Statistics, University of Washington (2014)\n", + "B.S. 
in Applied Mathematics, University of Texas at Austin (2012)\n", + "Experience:\n", + "\n", + "Data Scientist, QuantumTech (2016 – Present)\n", + "Designed and implemented machine learning algorithms for financial forecasting.\n", + "Improved model efficiency by 20% through algorithm optimization.\n", + "Junior Data Scientist, DataCore Solutions (2014 – 2016)\n", + "Assisted in developing predictive models for supply chain optimization.\n", + "Conducted data cleaning and preprocessing on large datasets.\n", + "Skills:\n", + "\n", + "Programming Languages: Python, R\n", + "Machine Learning Frameworks: PyTorch, Scikit-Learn\n", + "Statistical Analysis: SAS, SPSS\n", + "Cloud Platforms: AWS, Azure\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d55ce4c58f8efb67", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:50.950343Z", + "start_time": "2024-09-20T14:02:50.946378Z" + }, + "id": "d55ce4c58f8efb67" + }, + "outputs": [], + "source": [ + "job_4 = \"\"\"\n", + "CV 4: Not Relevant\n", + "Name: David Thompson\n", + "Contact Information:\n", + "\n", + "Email: david.thompson@example.com\n", + "Phone: (555) 456-7890\n", + "Summary:\n", + "\n", + "Creative Graphic Designer with over 8 years of experience in visual design and branding. Proficient in Adobe Creative Suite and passionate about creating compelling visuals.\n", + "\n", + "Education:\n", + "\n", + "B.F.A. 
in Graphic Design, Rhode Island School of Design (2012)\n", + "Experience:\n", + "\n", + "Senior Graphic Designer, CreativeWorks Agency (2015 – Present)\n", + "Led design projects for clients in various industries.\n", + "Created branding materials that increased client engagement by 30%.\n", + "Graphic Designer, Visual Innovations (2012 – 2015)\n", + "Designed marketing collateral, including brochures, logos, and websites.\n", + "Collaborated with the marketing team to develop cohesive brand strategies.\n", + "Skills:\n", + "\n", + "Design Software: Adobe Photoshop, Illustrator, InDesign\n", + "Web Design: HTML, CSS\n", + "Specialties: Branding and Identity, Typography\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca4ecc32721ad332", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:51.548191Z", + "start_time": "2024-09-20T14:02:51.545520Z" + }, + "id": "ca4ecc32721ad332" + }, + "outputs": [], + "source": [ + "job_5 = \"\"\"\n", + "CV 5: Not Relevant\n", + "Name: Jessica Miller\n", + "Contact Information:\n", + "\n", + "Email: jessica.miller@example.com\n", + "Phone: (555) 567-8901\n", + "Summary:\n", + "\n", + "Experienced Sales Manager with a strong track record in driving sales growth and building high-performing teams. Excellent communication and leadership skills.\n", + "\n", + "Education:\n", + "\n", + "B.A. in Business Administration, University of Southern California (2010)\n", + "Experience:\n", + "\n", + "Sales Manager, Global Enterprises (2015 – Present)\n", + "Managed a sales team of 15 members, achieving a 20% increase in annual revenue.\n", + "Developed sales strategies that expanded customer base by 25%.\n", + "Sales Representative, Market Leaders Inc. 
(2010 – 2015)\n", + "Consistently exceeded sales targets and received the 'Top Salesperson' award in 2013.\n", + "Skills:\n", + "\n", + "Sales Strategy and Planning\n", + "Team Leadership and Development\n", + "CRM Software: Salesforce, Zoho\n", + "Negotiation and Relationship Building\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#### Please add the necessary environment information below:" + ], + "metadata": { + "id": "onKOiY1ksR30" + }, + "id": "onKOiY1ksR30" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bce39dc6", + "metadata": { + "id": "bce39dc6" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# # Setting environment variables\n", + "os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n", + "os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n", + "\n", + "os.environ[\"LLM_API_KEY\"] = \"\"\n", + "\n", + "# \"neo4j\" or \"networkx\"\n", + "os.environ[\"GRAPH_DATABASE_PROVIDER\"] = \"networkx\"\n", + "# Not needed if using networkx\n", + "# GRAPH_DATABASE_URL=\"\"\n", + "# GRAPH_DATABASE_USERNAME=\"\"\n", + "# GRAPH_DATABASE_PASSWORD=\"\"\n", + "\n", + "# \"qdrant\", \"weaviate\" or \"lancedb\"\n", + "os.environ[\"VECTOR_ENGINE_PROVIDER\"] = \"lancedb\"\n", + "# Not needed if using \"lancedb\"\n", + "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n", + "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n", + "\n", + "# Database provider \"sqlite\" or \"postgres\"\n", + "os.environ[\"DB_PROVIDER\"] = \"sqlite\"\n", + "\n", + "# Database name\n", + "os.environ[\"DB_NAME\"] = \"cognee_db\"\n", + "\n", + "# Postgres specific parameters (Only if Postgres is run)\n", + "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n", + "# os.environ[\"DB_PORT\"]=\"5432\"\n", + "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n", + "# os.environ[\"DB_PASSWORD\"]=\"cognee\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f1a1dbd", + "metadata": { + "id": "9f1a1dbd" + }, + "outputs": [], + "source": [ + "# Reset the cognee system with the
following command:\n", + "\n", + "import cognee\n", + "\n", + "await cognee.prune.prune_data()\n", + "await cognee.prune.prune_system(metadata=True)" + ] + }, + { + "cell_type": "markdown", + "id": "383d6971", + "metadata": { + "id": "383d6971" + }, + "source": [ + "#### After we have defined and gathered our data let's add it to cognee" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "904df61ba484a8e5", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:54.243987Z", + "start_time": "2024-09-20T14:02:52.498195Z" + }, + "id": "904df61ba484a8e5" + }, + "outputs": [], + "source": [ + "import cognee\n", + "\n", + "await cognee.add([job_1, job_2, job_3, job_4, job_5, job_position], \"example\")" + ] + }, + { + "cell_type": "markdown", + "id": "0f15c5b1", + "metadata": { + "id": "0f15c5b1" + }, + "source": [ + "#### All good, let's cognify it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c431fdef4921ae0", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-20T14:02:57.925667Z", + "start_time": "2024-09-20T14:02:57.922353Z" + }, + "id": "7c431fdef4921ae0" + }, + "outputs": [], + "source": [ + "from cognee.shared.data_models import KnowledgeGraph\n", + "from cognee.modules.data.models import Dataset, Data\n", + "from cognee.modules.data.methods.get_dataset_data import get_dataset_data\n", + "from cognee.modules.cognify.config import get_cognify_config\n", + "from cognee.modules.pipelines.tasks.Task import Task\n", + "from cognee.modules.pipelines import run_tasks, run_tasks_parallel\n", + "from cognee.modules.users.models import User\n", + "from cognee.tasks.summarization import summarize_text\n", + "from cognee.tasks import (\n", + " chunk_remove_disconnected,\n", + " infer_data_ontology,\n", + " save_chunks_to_store,\n", + " chunk_update_check,\n", + " chunks_into_graph,\n", + " source_documents_to_chunks,\n", + " check_permissions_on_documents,\n", + " classify_documents,\n", + " 
chunk_naive_llm_classifier,\n", + ")\n", + "\n", + "\n", + "async def run_cognify_pipeline(dataset: Dataset, user: User = None):\n", + " data_documents: list[Data] = await get_dataset_data(dataset_id=dataset.id)\n", + "\n", + " try:\n", + " root_node_id = None\n", + "\n", + " cognee_config = get_cognify_config()\n", + "\n", + " tasks = [\n", + " Task(classify_documents),\n", + " Task(check_permissions_on_documents, user=user, permissions=[\"write\"]),\n", + " Task(\n", + " infer_data_ontology,\n", + " root_node_id=root_node_id,\n", + " ontology_model=KnowledgeGraph,\n", + " ),\n", + " Task(\n", + " source_documents_to_chunks, chunk_size=800, parent_node_id=root_node_id\n", + " ), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n", + " Task(\n", + " chunks_into_graph,\n", + " graph_model=KnowledgeGraph,\n", + " collection_name=\"entities\",\n", + " task_config={\"batch_size\": 10},\n", + " ), # Generate knowledge graphs from the document chunks and attach it to chunk nodes\n", + " Task(\n", + " chunk_update_check, collection_name=\"chunks\"\n", + " ), # Find all affected chunks, so we don't process unchanged chunks\n", + " Task(\n", + " save_chunks_to_store,\n", + " collection_name=\"chunks\",\n", + " ),\n", + " Task(\n", + " summarize_text,\n", + " summarization_model=cognee_config.summarization_model,\n", + " collection_name=\"summaries\",\n", + " ),\n", + " Task(\n", + " chunk_naive_llm_classifier,\n", + " classification_model=cognee_config.classification_model,\n", + " ),\n", + " Task(chunk_remove_disconnected), # Remove the obsolete document chunks.\n", + " ]\n", + "\n", + " pipeline = run_tasks(tasks, data_documents)\n", + "\n", + " async for result in pipeline:\n", + " print(result)\n", + " except Exception as error:\n", + " raise error" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0a91b99c6215e09", + "metadata": { + "ExecuteTime": { + "end_time": 
"2024-09-20T14:02:58.905774Z", + "start_time": "2024-09-20T14:02:58.625915Z" + }, + "id": "f0a91b99c6215e09" + }, + "outputs": [], + "source": [ + "from cognee.modules.users.methods import get_default_user\n", + "from cognee.modules.data.methods import get_datasets_by_name\n", + "\n", + "user = await get_default_user()\n", + "\n", + "datasets = await get_datasets_by_name([\"example\"], user.id)\n", + "\n", + "await run_cognify_pipeline(datasets[0], user)" + ] + }, + { + "cell_type": "markdown", + "id": "219a6d41", + "metadata": { + "id": "219a6d41" + }, + "source": [ + "#### We get the url to the graph on graphistry in the notebook cell below, showing nodes and connections made by the cognify process." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "080389e5", + "metadata": { + "id": "080389e5" + }, + "outputs": [], + "source": [ + "import os\n", + "from cognee.shared.utils import render_graph\n", + "from cognee.infrastructure.databases.graph import get_graph_engine\n", + "import graphistry\n", + "\n", + "graphistry.login(\n", + " username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\")\n", + ")\n", + "\n", + "graph_engine = await get_graph_engine()\n", + "\n", + "graph_url = await render_graph(graph_engine.graph)\n", + "print(graph_url)" + ] + }, + { + "cell_type": "markdown", + "id": "59e6c3c3", + "metadata": { + "id": "59e6c3c3" + }, + "source": [ + "#### We can also do a search on the data to explore the knowledge."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5e7dfc8", + "metadata": { + "id": "e5e7dfc8" + }, + "outputs": [], + "source": [ + "async def search(\n", + " vector_engine,\n", + " collection_name: str,\n", + " query_text: str = None,\n", + "):\n", + " query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]\n", + "\n", + " connection = await vector_engine.get_connection()\n", + " collection = await connection.open_table(collection_name)\n", + "\n", + " results = await collection.vector_search(query_vector).limit(10).to_pandas()\n", + "\n", + " result_values = list(results.to_dict(\"index\").values())\n", + "\n", + " return [\n", + " dict(\n", + " id=str(result[\"id\"]),\n", + " payload=result[\"payload\"],\n", + " score=result[\"_distance\"],\n", + " )\n", + " for result in result_values\n", + " ]\n", + "\n", + "\n", + "from cognee.infrastructure.databases.vector import get_vector_engine\n", + "\n", + "vector_engine = get_vector_engine()\n", + "results = await search(vector_engine, \"entities\", \"sarah.nguyen@example.com\")\n", + "for result in results:\n", + " print(result)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#### We normalize search output scores so the lower the score of the search result is the higher the chance that it's what you're looking for. 
In the example above we have searched for node entities in the knowledge graph related to \"sarah.nguyen@example.com\"" + ], + "metadata": { + "id": "F4s9pJyqhgtP" + }, + "id": "F4s9pJyqhgtP" + }, + { + "cell_type": "markdown", + "source": [ + "#### In the example below we'll use cognee search to summarize information regarding the node most related to \"sarah.nguyen@example.com\" in the knowledge graph" + ], + "metadata": { + "id": "v3KZN1J38g9c" + }, + "id": "v3KZN1J38g9c" + }, + { + "cell_type": "code", + "source": [ + "from cognee.api.v1.search import SearchType\n", + "\n", + "node = (await vector_engine.search(\"entities\", \"sarah.nguyen@example.com\"))[0]\n", + "node_name = node.payload[\"name\"]\n", + "\n", + "search_results = await cognee.search(SearchType.SUMMARIES, query=node_name)\n", + "print(\"\\n\\nExtracted summaries are:\\n\")\n", + "for result in search_results:\n", + " print(f\"{result}\\n\")" + ], + "metadata": { + "id": "o9Cdt1IF8jjH" + }, + "id": "o9Cdt1IF8jjH", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### In this example we'll use cognee search to find the chunks that contain the node most related to \"sarah.nguyen@example.com\"" + ], + "metadata": { + "id": "JiqylD3K8n3_" + }, + "id": "JiqylD3K8n3_" + }, + { + "cell_type": "code", + "source": [ + "search_results = await cognee.search(SearchType.CHUNKS, query=node_name)\n", + "print(\"\\n\\nExtracted chunks are:\\n\")\n", + "for result in search_results:\n", + " print(f\"{result}\\n\")" + ], + "metadata": { + "id": "j54MkQxQ8nBg" + }, + "id": "j54MkQxQ8nBg", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### In this example we'll use cognee search to give us insights from the knowledge graph related to the node most related to \"sarah.nguyen@example.com\"" + ], + "metadata": { + "id": "zBeVLjubFrOI" + }, + "id": "zBeVLjubFrOI" + }, + { + "cell_type": "code", + "source": [ +
"search_results = await cognee.search(SearchType.INSIGHTS, query=node_name)\n", + "print(\"\\n\\nExtracted insights are:\\n\")\n", + "for result in search_results:\n", + " print(f\"{result}\\n\")" + ], + "metadata": { + "id": "0FSaecZ-FrzF" + }, + "id": "0FSaecZ-FrzF", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### Below is a diagram of the cognee process for the data used in this example notebook" + ], + "metadata": { + "id": "4W1W_Om880Db" + }, + "id": "4W1W_Om880Db" + }, + { + "cell_type": "markdown", + "source": [ + "![cognee_final.drawio.png]()" + ], + "metadata": { + "id": "2gpysOFT816c" + }, + "id": "2gpysOFT816c" + }, + { + "cell_type": "markdown", + "id": "288ab570", + "metadata": { + "id": "288ab570" + }, + "source": [ + "## Give us a star if you like it!\n", + "https://github.com/topoteretes/cognee" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cognee-bGi0WgSG-py3.9", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + }, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/tutorials/cli-sdk-to-convert-image-datasets-to-lance/convert-any-image-dataset-to-lance.py b/tutorials/cli-sdk-to-convert-image-datasets-to-lance/convert-any-image-dataset-to-lance.py index 10cec49..be2a853 100644 --- a/tutorials/cli-sdk-to-convert-image-datasets-to-lance/convert-any-image-dataset-to-lance.py +++ b/tutorials/cli-sdk-to-convert-image-datasets-to-lance/convert-any-image-dataset-to-lance.py @@ -6,20 +6,22 @@ import time from tqdm import tqdm -def process_images(images_folder, split, schema): +def process_images(images_folder, split, schema): # Iterate over the categories within each
data type label_folder = os.path.join(images_folder, split) for label in os.listdir(label_folder): label_folder = os.path.join(images_folder, split, label) - + # Iterate over the images within each label - for filename in tqdm(os.listdir(label_folder), desc=f"Processing {split} - {label}"): + for filename in tqdm( + os.listdir(label_folder), desc=f"Processing {split} - {label}" + ): # Construct the full path to the image image_path = os.path.join(label_folder, filename) # Read and convert the image to a binary format - with open(image_path, 'rb') as f: + with open(image_path, "rb") as f: binary_data = f.read() image_array = pa.array([binary_data], type=pa.binary()) @@ -29,69 +31,84 @@ def process_images(images_folder, split, schema): # Yield RecordBatch for each image yield pa.RecordBatch.from_arrays( - [image_array, filename_array, label_array, split_array], - schema=schema + [image_array, filename_array, label_array, split_array], schema=schema ) + # Function to write PyArrow Table to Lance dataset def write_to_lance(images_folder, dataset_name, schema): - for split in ['train', 'test', 'val']: + for split in ["train", "test", "val"]: lance_file_path = os.path.join(images_folder, f"{dataset_name}_{split}.lance") - - reader = pa.RecordBatchReader.from_batches(schema, process_images(images_folder, split, schema)) + + reader = pa.RecordBatchReader.from_batches( + schema, process_images(images_folder, split, schema) + ) lance.write_dataset( reader, lance_file_path, schema, ) + def loading_into_pandas(images_folder, dataset_name): data_frames = {} # Dictionary to store DataFrames for each data type - + batch_size = args.batch_size - for split in ['test', 'train', 'val']: + for split in ["test", "train", "val"]: uri = os.path.join(images_folder, f"{dataset_name}_{split}.lance") ds = lance.dataset(uri) # Accumulate data from batches into a list data = [] - for batch in tqdm(ds.to_batches(columns=["image", "filename", "label", "split"], batch_size=batch_size), 
desc=f"Loading {split} batches"): + for batch in tqdm( + ds.to_batches( + columns=["image", "filename", "label", "split"], batch_size=batch_size + ), + desc=f"Loading {split} batches", + ): tbl = batch.to_pandas() data.append(tbl) # Concatenate all DataFrames into a single DataFrame df = pd.concat(data, ignore_index=True) - + # Store the DataFrame in the dictionary data_frames[split] = df - + print(f"Pandas DataFrame for {split} is ready") print("Total Rows: ", df.shape[0]) - + return data_frames + if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Process image dataset.') - parser.add_argument('--batch_size', type=int, default=10, help='Batch size for processing images') - parser.add_argument('--dataset', type=str, help='Path to the image dataset folder') - + parser = argparse.ArgumentParser(description="Process image dataset.") + parser.add_argument( + "--batch_size", type=int, default=10, help="Batch size for processing images" + ) + parser.add_argument("--dataset", type=str, help="Path to the image dataset folder") + try: args = parser.parse_args() dataset_path = args.dataset if dataset_path is None: - raise ValueError("Please provide the path to the image dataset folder using the --dataset argument.") - + raise ValueError( + "Please provide the path to the image dataset folder using the --dataset argument." 
+ ) + # Extract dataset name dataset_name = os.path.basename(dataset_path) start = time.time() - schema = pa.schema([ - pa.field("image", pa.binary()), - pa.field("filename", pa.string()), - pa.field("label", pa.string()), - pa.field("split", pa.string()) - ]) + schema = pa.schema( + [ + pa.field("image", pa.binary()), + pa.field("filename", pa.string()), + pa.field("label", pa.string()), + pa.field("split", pa.string()), + ] + ) write_to_lance(dataset_path, dataset_name, schema) data_frames = loading_into_pandas(dataset_path, dataset_name) end = time.time() @@ -100,7 +117,9 @@ def loading_into_pandas(images_folder, dataset_name): except ValueError as e: print(e) print("Example:") - print("python3 convert-any-image-dataset-to-lance.py --batch_size 10 --dataset image_dataset_folder") + print( + "python3 convert-any-image-dataset-to-lance.py --batch_size 10 --dataset image_dataset_folder" + ) exit(1) except FileNotFoundError: diff --git a/tutorials/cli-sdk-to-convert-image-datasets-to-lance/main.ipynb b/tutorials/cli-sdk-to-convert-image-datasets-to-lance/main.ipynb index eabdd1e..a5be18c 100644 --- a/tutorials/cli-sdk-to-convert-image-datasets-to-lance/main.ipynb +++ b/tutorials/cli-sdk-to-convert-image-datasets-to-lance/main.ipynb @@ -1,232 +1,246 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "UByKm8Q6dCEB" - }, - "source": [ - "### Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "J8jICxVldCEC" - }, - "outputs": [], - "source": [ - "import os\n", - "import pandas as pd\n", - "import pyarrow as pa\n", - "import lance\n", - "import time\n", - "from tqdm import tqdm\n", - "\n", - "import warnings\n", - "warnings.simplefilter('ignore')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AfUoIr4TdCEF" - }, - "source": [ - "### Set the variable according to your Image dataset\n", - "\n", - "Assign the path to your image dataset to the variable `image_dataset`. 
This dataset should contain your images organized into training, testing, and validation folders. These images will be used to convert them into Lance format.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5JvY3ucxdCEF" - }, - "outputs": [], - "source": [ - "image_dataset = \"image_dataset\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4ULduqF2dCEG" - }, - "source": [ - "### Processing the Images\n", - "\n", - "The `process_images` function is the central component of this notebook, responsible for transforming images from the training, testing, and validation folders into Lance format. This format typically includes essential attributes such as `image`, `filename`, `category`, and `data_type`.\n", - "\n", - "Specifically, `image` represents the actual image data, `filename` denotes the name of the file, `category` indicates the category to which the image belongs, and `data_type` specifies whether the image is from the training, testing, or validation set." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "S1wX3JVmdCEG" - }, - "outputs": [], - "source": [ - "def process_images():\n", - " # Get the current directory path\n", - " current_dir = os.getcwd()\n", - " images_folder = os.path.join(current_dir, image_dataset)\n", - " print(images_folder)\n", - "\n", - " # Define schema for RecordBatch\n", - " schema = pa.schema([('image', pa.binary()),\n", - " ('filename', pa.string()),\n", - " ('category', pa.string()),\n", - " ('data_type', pa.string())])\n", - "\n", - " # Iterate over the data types (train, test, valid)\n", - " for data_type in ['train', 'test', 'val']:\n", - " data_type_folder = os.path.join(images_folder, data_type)\n", - "\n", - " # Iterate over the categories within each data type\n", - " for category in os.listdir(data_type_folder):\n", - " category_folder = os.path.join(data_type_folder, category)\n", - "\n", - " # Iterate over the images within each category\n", - " for filename in tqdm(os.listdir(category_folder), desc=f\"Processing {data_type} - {category}\"):\n", - " # Construct the full path to the image\n", - " image_path = os.path.join(category_folder, filename)\n", - "\n", - " # Read and convert the image to a binary format\n", - " with open(image_path, 'rb') as f:\n", - " binary_data = f.read()\n", - "\n", - " image_array = pa.array([binary_data], type=pa.binary())\n", - " filename_array = pa.array([filename], type=pa.string())\n", - " category_array = pa.array([category], type=pa.string())\n", - " data_type_array = pa.array([data_type], type=pa.string())\n", - "\n", - " # Yield RecordBatch for each image\n", - " yield pa.RecordBatch.from_arrays(\n", - " [image_array, filename_array, category_array, data_type_array],\n", - " schema=schema\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gcc4JvAYdCEI" - }, - "source": [ - "### Creating a Lance Dataset\n", - "\n", - "This function, `write_to_lance`, is designed to convert a 
PyArrow Table into a Lance dataset. It begins by defining the schema for the Lance dataset, specifying fields such as `image`, `filename`, `category`, and `data_type` , make sure the schema is the same as the one defined in the `process_images` function.\n", - "\n", - "Once the schema is established, the function determines the path for saving the Lance file, leveraging the current working directory and the provided `image_dataset` variable. It then initializes a RecordBatchReader using the defined schema and the data obtained from the `process_images` function." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "IhfZI177dCEJ" - }, - "outputs": [], - "source": [ - "# Function to write PyArrow Table to Lance dataset\n", - "def write_to_lance():\n", - " # Create an empty RecordBatchIterator\n", - " schema = pa.schema([\n", - " pa.field(\"image\", pa.binary()),\n", - " pa.field(\"filename\", pa.string()),\n", - " pa.field(\"category\", pa.string()),\n", - " pa.field(\"data_type\", pa.string())\n", - " ])\n", - "\n", - " # Specify the path where you want to save the Lance file\n", - " current_dir = os.getcwd()\n", - " images_folder = os.path.join(current_dir, image_dataset)\n", - " lance_file_path = os.path.join(images_folder, f\"{image_dataset}.lance\")\n", - "\n", - " reader = pa.RecordBatchReader.from_batches(schema, process_images())\n", - " lance.write_dataset(\n", - " reader,\n", - " lance_file_path,\n", - " schema,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tW9GeYJddCEL" - }, - "source": [ - "### Load a Lance Dataset and Visualize it in Pandas Dataframe" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UQHGcmzPdCEO" - }, - "source": [ - "`loading_into_pandas` function is designed to load a Lance dataset into a Pandas dataframe. 
It let's you see your Lance dataset in a pandas dataframe.\n", - "\n", - "The function takes the path to the Lance file as an argument and returns a pandas dataframe. Make sure the schema is the same as the one defined during the Lance dataset generation, refer to `process_images` function and also make sure the path to the Lance file is correct." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4wwa4FQmdCEP" - }, - "outputs": [], - "source": [ - "def loading_into_pandas():\n", - " # Load Lance file from the same folder\n", - " current_dir = os.getcwd()\n", - " images_folder = os.path.join(current_dir, image_dataset)\n", - " uri = os.path.join(images_folder, \"image_dataset.lance\")\n", - "\n", - " ds = lance.dataset(uri)\n", - "\n", - " # Accumulate data from batches into a list\n", - " data = []\n", - " for batch in tqdm(ds.to_batches(columns=[\"image\", \"filename\", \"category\", \"data_type\"], batch_size=10), desc=\"Loading batches\"):\n", - " tbl = batch.to_pandas()\n", - " data.append(tbl)\n", - "\n", - " # Concatenate all DataFrames into a single DataFrame\n", - " df = pd.concat(data, ignore_index=True)\n", - " print(\"Pandas DataFrame is ready\")\n", - " print(\"Total Rows: \", df.shape[0])" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.2" - }, - "colab": { - "provenance": [] - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "UByKm8Q6dCEB" + }, + "source": [ + "### Imports" + ] }, - "nbformat": 4, - "nbformat_minor": 0 + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "J8jICxVldCEC" + }, + "outputs": [], + "source": [ + "import os\n", + "import pandas 
as pd\n", + "import pyarrow as pa\n", + "import lance\n", + "import time\n", + "from tqdm import tqdm\n", + "\n", + "import warnings\n", + "\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AfUoIr4TdCEF" + }, + "source": [ + "### Set the variable according to your Image dataset\n", + "\n", + "Assign the path to your image dataset to the variable `image_dataset`. This dataset should contain your images organized into training, testing, and validation folders. These images will be used to convert them into Lance format.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5JvY3ucxdCEF" + }, + "outputs": [], + "source": [ + "image_dataset = \"image_dataset\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4ULduqF2dCEG" + }, + "source": [ + "### Processing the Images\n", + "\n", + "The `process_images` function is the central component of this notebook, responsible for transforming images from the training, testing, and validation folders into Lance format. This format typically includes essential attributes such as `image`, `filename`, `category`, and `data_type`.\n", + "\n", + "Specifically, `image` represents the actual image data, `filename` denotes the name of the file, `category` indicates the category to which the image belongs, and `data_type` specifies whether the image is from the training, testing, or validation set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "S1wX3JVmdCEG" + }, + "outputs": [], + "source": [ + "def process_images():\n", + " # Get the current directory path\n", + " current_dir = os.getcwd()\n", + " images_folder = os.path.join(current_dir, image_dataset)\n", + " print(images_folder)\n", + "\n", + " # Define schema for RecordBatch\n", + " schema = pa.schema(\n", + " [\n", + " (\"image\", pa.binary()),\n", + " (\"filename\", pa.string()),\n", + " (\"category\", pa.string()),\n", + " (\"data_type\", pa.string()),\n", + " ]\n", + " )\n", + "\n", + " # Iterate over the data types (train, test, valid)\n", + " for data_type in [\"train\", \"test\", \"val\"]:\n", + " data_type_folder = os.path.join(images_folder, data_type)\n", + "\n", + " # Iterate over the categories within each data type\n", + " for category in os.listdir(data_type_folder):\n", + " category_folder = os.path.join(data_type_folder, category)\n", + "\n", + " # Iterate over the images within each category\n", + " for filename in tqdm(\n", + " os.listdir(category_folder), desc=f\"Processing {data_type} - {category}\"\n", + " ):\n", + " # Construct the full path to the image\n", + " image_path = os.path.join(category_folder, filename)\n", + "\n", + " # Read and convert the image to a binary format\n", + " with open(image_path, \"rb\") as f:\n", + " binary_data = f.read()\n", + "\n", + " image_array = pa.array([binary_data], type=pa.binary())\n", + " filename_array = pa.array([filename], type=pa.string())\n", + " category_array = pa.array([category], type=pa.string())\n", + " data_type_array = pa.array([data_type], type=pa.string())\n", + "\n", + " # Yield RecordBatch for each image\n", + " yield pa.RecordBatch.from_arrays(\n", + " [image_array, filename_array, category_array, data_type_array],\n", + " schema=schema,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Gcc4JvAYdCEI" + }, + "source": [ + "### Creating a Lance Dataset\n", + 
"\n", + "This function, `write_to_lance`, is designed to convert a PyArrow Table into a Lance dataset. It begins by defining the schema for the Lance dataset, specifying fields such as `image`, `filename`, `category`, and `data_type` , make sure the schema is the same as the one defined in the `process_images` function.\n", + "\n", + "Once the schema is established, the function determines the path for saving the Lance file, leveraging the current working directory and the provided `image_dataset` variable. It then initializes a RecordBatchReader using the defined schema and the data obtained from the `process_images` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IhfZI177dCEJ" + }, + "outputs": [], + "source": [ + "# Function to write PyArrow Table to Lance dataset\n", + "def write_to_lance():\n", + " # Create an empty RecordBatchIterator\n", + " schema = pa.schema(\n", + " [\n", + " pa.field(\"image\", pa.binary()),\n", + " pa.field(\"filename\", pa.string()),\n", + " pa.field(\"category\", pa.string()),\n", + " pa.field(\"data_type\", pa.string()),\n", + " ]\n", + " )\n", + "\n", + " # Specify the path where you want to save the Lance file\n", + " current_dir = os.getcwd()\n", + " images_folder = os.path.join(current_dir, image_dataset)\n", + " lance_file_path = os.path.join(images_folder, f\"{image_dataset}.lance\")\n", + "\n", + " reader = pa.RecordBatchReader.from_batches(schema, process_images())\n", + " lance.write_dataset(\n", + " reader,\n", + " lance_file_path,\n", + " schema,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tW9GeYJddCEL" + }, + "source": [ + "### Load a Lance Dataset and Visualize it in Pandas Dataframe" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UQHGcmzPdCEO" + }, + "source": [ + "`loading_into_pandas` function is designed to load a Lance dataset into a Pandas dataframe. 
It lets you see your Lance dataset in a pandas dataframe.\n", + "\n", + "The function takes no arguments: it opens the Lance file inside the `image_dataset` folder, concatenates its batches into a single pandas dataframe, and prints the total number of rows. Make sure the schema is the same as the one defined during the Lance dataset generation, refer to `process_images` function and also make sure the path to the Lance file is correct." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4wwa4FQmdCEP" + }, + "outputs": [], + "source": [ + "def loading_into_pandas():\n", + " # Load Lance file from the same folder\n", + " current_dir = os.getcwd()\n", + " images_folder = os.path.join(current_dir, image_dataset)\n", + " uri = os.path.join(images_folder, \"image_dataset.lance\")\n", + "\n", + " ds = lance.dataset(uri)\n", + "\n", + " # Accumulate data from batches into a list\n", + " data = []\n", + " for batch in tqdm(\n", + " ds.to_batches(\n", + " columns=[\"image\", \"filename\", \"category\", \"data_type\"], batch_size=10\n", + " ),\n", + " desc=\"Loading batches\",\n", + " ):\n", + " tbl = batch.to_pandas()\n", + " data.append(tbl)\n", + "\n", + " # Concatenate all DataFrames into a single DataFrame\n", + " df = pd.concat(data, ignore_index=True)\n", + " print(\"Pandas DataFrame is ready\")\n", + " print(\"Total Rows: \", df.shape[0])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + }, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file