diff --git a/build/lib/lyzr/base/llms.py b/build/lib/lyzr/base/llms.py index 15959be..a79c7d6 100644 --- a/build/lib/lyzr/base/llms.py +++ b/build/lib/lyzr/base/llms.py @@ -1,6 +1,6 @@ # standard library imports import os -from typing import Optional, Literal +from typing import Optional, Literal, Union # third-party imports from openai import OpenAI @@ -156,7 +156,7 @@ def get_model( def set_model_params( - params: dict, model_kwargs: dict, force: bool | dict = None + params: dict, model_kwargs: dict, force: Union[bool, dict] = None ) -> dict: force = force or False for param in params: diff --git a/build/lib/lyzr/data_analyzr/analyzr.py b/build/lib/lyzr/data_analyzr/analyzr.py index c946ea7..465b29b 100644 --- a/build/lib/lyzr/data_analyzr/analyzr.py +++ b/build/lib/lyzr/data_analyzr/analyzr.py @@ -193,6 +193,9 @@ def _set_logger(self, log_level, print_log): self.logger.addHandler(handler) log_filename = self.log_filename + dir_path = os.path.dirname(log_filename) + if dir_path.strip() != "": + os.makedirs(dir_path, exist_ok=True) file_handler = logging.FileHandler( log_filename, mode="a" ) # Open the log file in append mode @@ -532,7 +535,7 @@ def tasks( self, user_input: Optional[str] = None, tasks_context: Optional[str] = None, - n_tasks: Optional[int] = 3, + n_tasks: Optional[int] = 5, # legacy usage insights: Optional[str] = None, recommendations: Optional[str] = None, diff --git a/build/lib/lyzr/data_analyzr/db_connector.py b/build/lib/lyzr/data_analyzr/db_connector.py index 323e53e..30d12ff 100644 --- a/build/lib/lyzr/data_analyzr/db_connector.py +++ b/build/lib/lyzr/data_analyzr/db_connector.py @@ -182,14 +182,15 @@ def get_default_training_plan(self): class PostgresConnector(DatabaseConnector): + def __init__( self, host: str, - port: int | str, + port: Union[int, str], database: str, user: str, password: str, - schema: str | list = None, + schema: Union[list, str] = None, tables: list[str] = None, ): self.host = host or os.getenv("POSTGRES_HOST") @@ -225,7 +226,7 @@ def __init__( def fetch_dataframes_dict( self, - schema: str | list = None, + schema: Union[str, list] = None, tables: list[str] = None, ) -> dict[pd.DataFrame]: schema = schema or self.schema or None @@ -451,7 +452,7 @@ def __init__(self, db_path: str = None): ) from e @staticmethod - def _download_db(url: str) -> str | None: + def _download_db(url: str) -> Union[str, None]: url = urlparse(url).path if os.path.exists(url): return url diff --git a/build/lib/lyzr/data_analyzr/file_utils.py b/build/lib/lyzr/data_analyzr/file_utils.py index 99cd323..8117d3f 100644 --- a/build/lib/lyzr/data_analyzr/file_utils.py +++ b/build/lib/lyzr/data_analyzr/file_utils.py @@ -1,6 +1,6 @@ # standard library imports import os -from typing import Literal +from typing import Literal, Union # third-party imports import pandas as pd @@ -91,7 +91,7 @@ def get_dict_of_files(datasets: dict, kwargs) -> dict[pd.DataFrame]: def read_file_or_folder( - name: str, filepath: str | pd.DataFrame, kwargs + name: str, filepath: Union[str, pd.DataFrame], kwargs ) -> dict[pd.DataFrame]: if isinstance(filepath, pd.DataFrame): return {name: filepath} @@ -114,7 +114,7 @@ def read_file_or_folder( ) -def get_list_of_kwargs(datasets: dict, kwargs: dict | list) -> list[dict]: +def get_list_of_kwargs(datasets: dict, kwargs: Union[dict, list]) -> list[dict]: if isinstance(kwargs, list) and len(kwargs) == len(datasets): return kwargs kwargs_list = [{} for _ in range(len(datasets))] diff --git a/build/lib/lyzr/data_analyzr/plot_utils.py b/build/lib/lyzr/data_analyzr/plot_utils.py index d955456..07a7ea9 100644 --- a/build/lib/lyzr/data_analyzr/plot_utils.py +++ b/build/lib/lyzr/data_analyzr/plot_utils.py @@ -45,20 +45,29 @@ def __init__( self.plotting_library = "matplotlib" self.output_format = "png" - self.plot_path = plot_path - if not os.path.isfile(self.plot_path): - dir_path = os.path.dirname(self.plot_path) - if dir_path.strip() != "": - os.makedirs(dir_path, exist_ok=True) - if os.path.isdir(self.plot_path): - self.plot_path = os.path.join(self.plot_path, "plot.png") - else: - self.logger.warn( - f'Incorrect path for plot image provided: {self.plot_path}. Defaulting to "generated_plots/plot.png".' - ) - self.plot_path = "generated_plots/plot.png" - if os.path.splitext(self.plot_path)[1] != ".png": - self.plot_path = os.path.join(os.path.splitext(self.plot_path)[0], ".png") + self.plot_path = self._handle_plotpath(plot_path) + + def _handle_plotpath(self, plot_path) -> str: + plot_path = PlotFactory._fix_plotpath(plot_path) + try: + open(plot_path, "w").close() + return plot_path + except Exception: + self.logger.warning( + f'Incorrect path for plot image provided: {self.plot_path}. Defaulting to "generated_plots/plot.png".' + ) + return self._handle_plotpath("generated_plots/plot.png") + + @staticmethod + def _fix_plotpath(plot_path: str) -> str: + if os.path.isdir(plot_path): + plot_path = os.path.join(plot_path, "plot.png") + if os.path.splitext(plot_path)[1] != ".png": + plot_path = os.path.splitext(plot_path)[0] + ".png" + dir_path = os.path.dirname(plot_path) + if dir_path.strip() != "": + os.makedirs(dir_path, exist_ok=True) + return plot_path def _get_plotting_guide(self, user_input: str) -> str: self.model.set_messages( @@ -297,9 +306,29 @@ def _create_plot(self, plot_details: dict, df: pd.DataFrame) -> plt.Figure: return fig def get_visualisation(self, df: pd.DataFrame) -> str: - fig = self._create_plot(self.plotting_steps, df) + self.fig = self._create_plot(self.plotting_steps, df) plt.tight_layout() - fig.savefig(self.plot_path) - plt.close(fig) - self.logger.info(f"\nPlot saved at: {self.plot_path}\n") + if not PlotFactory._savefig(self.fig, self.plot_path, self.logger): + self.logger.error( + f"Error saving plot at: {self.plot_path}. Plot not saved. Displaying plot instead. Access the plot using `.fig` attribute." + ) + plt.show() + else: + self.logger.info(f"\nPlot saved at: {self.plot_path}\n") + plt.close(self.fig) return self.plot_path + + @staticmethod + def _savefig(fig, path, logger): + try: + dir_path = os.path.dirname(path) + if dir_path.strip() != "": + os.makedirs(dir_path, exist_ok=True) + fig.savefig(path) + return True + except Exception: + logger.error( + f"Error saving plot at: {path}. Trying to save at default location: 'generated_plots/plot.png'." + ) + PlotFactory._savefig(fig, "generated_plots/plot.png", logger) + return False diff --git a/dist/lyzr-0.1.27.tar.gz b/dist/lyzr-0.1.27.tar.gz deleted file mode 100644 index c2541e8..0000000 Binary files a/dist/lyzr-0.1.27.tar.gz and /dev/null differ diff --git a/dist/lyzr-0.1.27-py3-none-any.whl b/dist/lyzr-0.1.28-py3-none-any.whl similarity index 55% rename from dist/lyzr-0.1.27-py3-none-any.whl rename to dist/lyzr-0.1.28-py3-none-any.whl index afe0207..0066a0e 100644 Binary files a/dist/lyzr-0.1.27-py3-none-any.whl and b/dist/lyzr-0.1.28-py3-none-any.whl differ diff --git a/dist/lyzr-0.1.28.tar.gz b/dist/lyzr-0.1.28.tar.gz new file mode 100644 index 0000000..83d51a4 Binary files /dev/null and b/dist/lyzr-0.1.28.tar.gz differ diff --git a/lyzr.egg-info/PKG-INFO b/lyzr.egg-info/PKG-INFO index 0d63deb..c143d21 100644 --- a/lyzr.egg-info/PKG-INFO +++ b/lyzr.egg-info/PKG-INFO @@ -1,18 +1,37 @@ Metadata-Version: 2.1 Name: lyzr -Version: 0.1.27 -Summary: UNKNOWN -Home-page: UNKNOWN +Version: 0.1.28 +Home-page: Author: lyzr -License: UNKNOWN -Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: License :: OSI Approved :: MIT License Classifier: Operating System :: OS Independent Requires-Python: >=3.8.1, <3.12 Description-Content-Type: text/markdown -Provides-Extra: data-analyzr License-File: LICENSE.txt +Requires-Dist: asyncio +Requires-Dist: nest_asyncio +Requires-Dist: openai==1.3.4 +Requires-Dist: litellm==1.2.0 +Requires-Dist: llama-index==0.9.4 +Requires-Dist: langchain==0.0.339 +Requires-Dist: python-dotenv>=1.0.0 +Requires-Dist: beautifulsoup4==4.12.2 +Requires-Dist: pandas==2.0.2 +Requires-Dist: weaviate-client==3.25.3 +Requires-Dist: llmsherpa +Provides-Extra: data-analyzr +Requires-Dist: scikit-learn==1.4.0; extra == "data-analyzr" +Requires-Dist: statsmodels==0.14.1; extra == "data-analyzr" +Requires-Dist: chromadb==0.4.22; extra == "data-analyzr" +Requires-Dist: tabulate==0.9.0; extra == "data-analyzr" +Requires-Dist: pmdarima==2.0.4; extra == "data-analyzr" +Requires-Dist: openpyxl==3.1.2; extra == "data-analyzr" +Requires-Dist: matplotlib==3.8.2; extra == "data-analyzr" +Requires-Dist: redshift_connector==2.0.918; extra == "data-analyzr" +Requires-Dist: mysql-connector-python==8.2.0; extra == "data-analyzr" +Requires-Dist: psycopg2-binary==2.9.9; extra == "data-analyzr" +Requires-Dist: snowflake-connector-python==3.6.0; extra == "data-analyzr" # lyzr @@ -73,5 +92,3 @@ Replace `[version]` with the actual version of the package you have built. ## License `lyzr` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license. - - diff --git a/lyzr.egg-info/SOURCES.txt b/lyzr.egg-info/SOURCES.txt index 77a4c3a..3bc455e 100644 --- a/lyzr.egg-info/SOURCES.txt +++ b/lyzr.egg-info/SOURCES.txt @@ -59,19 +59,6 @@ lyzr/formula_generator/formula_generator.py lyzr/qa/__init__.py lyzr/qa/qa_bot.py lyzr/qa/search_agent.py -lyzr/utils/__init__.py -lyzr/utils/chat_utils.py -lyzr/utils/constants.py -lyzr/utils/db_utils.py -lyzr/utils/document_reading.py -lyzr/utils/docx_reader.py -lyzr/utils/env_utils.py -lyzr/utils/pdf_reader.py -lyzr/utils/rag_utils.py lyzr/utils/search_utils.py -lyzr/utils/txt_reader.py -lyzr/utils/webpage_reader.py -lyzr/utils/website_reader.py -lyzr/utils/youtube_reader.py lyzr/voicebot/__init__.py lyzr/voicebot/voicebot.py \ No newline at end of file diff --git a/lyzr.egg-info/requires.txt b/lyzr.egg-info/requires.txt index 131c844..016f743 100644 --- a/lyzr.egg-info/requires.txt +++ b/lyzr.egg-info/requires.txt @@ -1,24 +1,24 @@ asyncio -beautifulsoup4==4.12.2 -langchain==0.0.339 -litellm==1.2.0 -llama-index==0.9.4 -llmsherpa nest_asyncio openai==1.3.4 -pandas==2.0.2 +litellm==1.2.0 +llama-index==0.9.4 +langchain==0.0.339 python-dotenv>=1.0.0 +beautifulsoup4==4.12.2 +pandas==2.0.2 weaviate-client==3.25.3 +llmsherpa [data-analyzr] +scikit-learn==1.4.0 +statsmodels==0.14.1 chromadb==0.4.22 +tabulate==0.9.0 +pmdarima==2.0.4 +openpyxl==3.1.2 matplotlib==3.8.2 +redshift_connector==2.0.918 mysql-connector-python==8.2.0 -openpyxl==3.1.2 -pmdarima==2.0.4 psycopg2-binary==2.9.9 -redshift_connector==2.0.918 -scikit-learn==1.4.0 snowflake-connector-python==3.6.0 -statsmodels==0.14.1 -tabulate==0.9.0 diff --git a/lyzr/base/llms.py b/lyzr/base/llms.py index 15959be..a79c7d6 100644 --- a/lyzr/base/llms.py +++ b/lyzr/base/llms.py @@ -1,6 +1,6 @@ # standard library imports import os -from typing import Optional, Literal +from typing import Optional, Literal, Union # third-party imports from openai import OpenAI @@ -156,7 +156,7 @@ def get_model( def set_model_params( - params: dict, model_kwargs: dict, force: bool | dict = None + params: dict, model_kwargs: dict, force: Union[bool, dict] = None ) -> dict: force = force or False for param in params: diff --git a/lyzr/data_analyzr/analyzr.py b/lyzr/data_analyzr/analyzr.py index c946ea7..465b29b 100644 --- a/lyzr/data_analyzr/analyzr.py +++ b/lyzr/data_analyzr/analyzr.py @@ -193,6 +193,9 @@ def _set_logger(self, log_level, print_log): self.logger.addHandler(handler) log_filename = self.log_filename + dir_path = os.path.dirname(log_filename) + if dir_path.strip() != "": + os.makedirs(dir_path, exist_ok=True) file_handler = logging.FileHandler( log_filename, mode="a" ) # Open the log file in append mode @@ -532,7 +535,7 @@ def tasks( self, user_input: Optional[str] = None, tasks_context: Optional[str] = None, - n_tasks: Optional[int] = 3, + n_tasks: Optional[int] = 5, # legacy usage insights: Optional[str] = None, recommendations: Optional[str] = None, diff --git a/lyzr/data_analyzr/db_connector.py b/lyzr/data_analyzr/db_connector.py index 323e53e..30d12ff 100644 --- a/lyzr/data_analyzr/db_connector.py +++ b/lyzr/data_analyzr/db_connector.py @@ -182,14 +182,15 @@ def get_default_training_plan(self): class PostgresConnector(DatabaseConnector): + def __init__( self, host: str, - port: int | str, + port: Union[int, str], database: str, user: str, password: str, - schema: str | list = None, + schema: Union[list, str] = None, tables: list[str] = None, ): self.host = host or os.getenv("POSTGRES_HOST") @@ -225,7 +226,7 @@ def __init__( def fetch_dataframes_dict( self, - schema: str | list = None, + schema: Union[str, list] = None, tables: list[str] = None, ) -> dict[pd.DataFrame]: schema = schema or self.schema or None @@ -451,7 +452,7 @@ def __init__(self, db_path: str = None): ) from e @staticmethod - def _download_db(url: str) -> str | None: + def _download_db(url: str) -> Union[str, None]: url = urlparse(url).path if os.path.exists(url): return url diff --git a/lyzr/data_analyzr/file_utils.py b/lyzr/data_analyzr/file_utils.py index 99cd323..8117d3f 100644 --- a/lyzr/data_analyzr/file_utils.py +++ b/lyzr/data_analyzr/file_utils.py @@ -1,6 +1,6 @@ # standard library imports import os -from typing import Literal +from typing import Literal, Union # third-party imports import pandas as pd @@ -91,7 +91,7 @@ def get_dict_of_files(datasets: dict, kwargs) -> dict[pd.DataFrame]: def read_file_or_folder( - name: str, filepath: str | pd.DataFrame, kwargs + name: str, filepath: Union[str, pd.DataFrame], kwargs ) -> dict[pd.DataFrame]: if isinstance(filepath, pd.DataFrame): return {name: filepath} @@ -114,7 +114,7 @@ def read_file_or_folder( ) -def get_list_of_kwargs(datasets: dict, kwargs: dict | list) -> list[dict]: +def get_list_of_kwargs(datasets: dict, kwargs: Union[dict, list]) -> list[dict]: if isinstance(kwargs, list) and len(kwargs) == len(datasets): return kwargs kwargs_list = [{} for _ in range(len(datasets))] diff --git a/lyzr/data_analyzr/plot_utils.py b/lyzr/data_analyzr/plot_utils.py index d955456..07a7ea9 100644 --- a/lyzr/data_analyzr/plot_utils.py +++ b/lyzr/data_analyzr/plot_utils.py @@ -45,20 +45,29 @@ def __init__( self.plotting_library = "matplotlib" self.output_format = "png" - self.plot_path = plot_path - if not os.path.isfile(self.plot_path): - dir_path = os.path.dirname(self.plot_path) - if dir_path.strip() != "": - os.makedirs(dir_path, exist_ok=True) - if os.path.isdir(self.plot_path): - self.plot_path = os.path.join(self.plot_path, "plot.png") - else: - self.logger.warn( - f'Incorrect path for plot image provided: {self.plot_path}. Defaulting to "generated_plots/plot.png".' - ) - self.plot_path = "generated_plots/plot.png" - if os.path.splitext(self.plot_path)[1] != ".png": - self.plot_path = os.path.join(os.path.splitext(self.plot_path)[0], ".png") + self.plot_path = self._handle_plotpath(plot_path) + + def _handle_plotpath(self, plot_path) -> str: + plot_path = PlotFactory._fix_plotpath(plot_path) + try: + open(plot_path, "w").close() + return plot_path + except Exception: + self.logger.warning( + f'Incorrect path for plot image provided: {self.plot_path}. Defaulting to "generated_plots/plot.png".' + ) + return self._handle_plotpath("generated_plots/plot.png") + + @staticmethod + def _fix_plotpath(plot_path: str) -> str: + if os.path.isdir(plot_path): + plot_path = os.path.join(plot_path, "plot.png") + if os.path.splitext(plot_path)[1] != ".png": + plot_path = os.path.splitext(plot_path)[0] + ".png" + dir_path = os.path.dirname(plot_path) + if dir_path.strip() != "": + os.makedirs(dir_path, exist_ok=True) + return plot_path def _get_plotting_guide(self, user_input: str) -> str: self.model.set_messages( @@ -297,9 +306,29 @@ def _create_plot(self, plot_details: dict, df: pd.DataFrame) -> plt.Figure: return fig def get_visualisation(self, df: pd.DataFrame) -> str: - fig = self._create_plot(self.plotting_steps, df) + self.fig = self._create_plot(self.plotting_steps, df) plt.tight_layout() - fig.savefig(self.plot_path) - plt.close(fig) - self.logger.info(f"\nPlot saved at: {self.plot_path}\n") + if not PlotFactory._savefig(self.fig, self.plot_path, self.logger): + self.logger.error( + f"Error saving plot at: {self.plot_path}. Plot not saved. Displaying plot instead. Access the plot using `.fig` attribute." + ) + plt.show() + else: + self.logger.info(f"\nPlot saved at: {self.plot_path}\n") + plt.close(self.fig) return self.plot_path + + @staticmethod + def _savefig(fig, path, logger): + try: + dir_path = os.path.dirname(path) + if dir_path.strip() != "": + os.makedirs(dir_path, exist_ok=True) + fig.savefig(path) + return True + except Exception: + logger.error( + f"Error saving plot at: {path}. Trying to save at default location: 'generated_plots/plot.png'." + ) + PlotFactory._savefig(fig, "generated_plots/plot.png", logger) + return False diff --git a/lyzr/utils/__init__.py b/lyzr/utils/__init__.py deleted file mode 100644 index 242c48e..0000000 --- a/lyzr/utils/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from lyzr.utils.document_reading import ( - read_pdf_as_documents, - read_txt_as_documents, - read_docx_as_documents, - read_webpage_as_documents, - read_website_as_documents, - read_youtube_as_documents, -) - -__all__ = [ - "read_pdf_as_documents", - "read_txt_as_documents", - "read_docx_as_documents", - "read_webpage_as_documents", - "read_website_as_documents", - "read_youtube_as_documents", -] diff --git a/lyzr/utils/chat_utils.py b/lyzr/utils/chat_utils.py deleted file mode 100644 index 1466b3f..0000000 --- a/lyzr/utils/chat_utils.py +++ /dev/null @@ -1,464 +0,0 @@ -from typing import Union, Optional, List - -from llama_index.chat_engine.types import BaseChatEngine, ChatMode -from llama_index.embeddings.utils import EmbedType -from llama_index.chat_engine import ContextChatEngine -from llama_index.memory import ChatMemoryBuffer - -from lyzr.base.llm import LyzrLLMFactory -from lyzr.base.service import LyzrService -from lyzr.base.vector_store import LyzrVectorStoreIndex -from lyzr.base.retrievers import LyzrRetriever - -from lyzr.utils.document_reading import ( - read_pdf_as_documents, - read_docx_as_documents, - read_txt_as_documents, - read_website_as_documents, - read_webpage_as_documents, - read_youtube_as_documents, -) - - -def pdf_chat_( - input_dir: Optional[str] = None, - input_files: Optional[List] = None, - exclude_hidden: bool = True, - filename_as_id: bool = True, - recursive: bool = True, - required_exts: Optional[List[str]] = None, - system_prompt: str = None, - query_wrapper_prompt: str = None, - embed_model: Union[str, EmbedType] = "default", - llm_params: dict = None, - vector_store_params: dict = None, - service_context_params: dict = None, - chat_engine_params: dict = None, - retriever_params: dict = None, -) -> BaseChatEngine: - documents = read_pdf_as_documents( - input_dir=input_dir, - input_files=input_files, - exclude_hidden=exclude_hidden, - filename_as_id=filename_as_id, - recursive=recursive, - required_exts=required_exts, - ) - - llm_params = ( - { - "model": "gpt-4-0125-preview", - "temperature": 0, - } - if llm_params is None - else llm_params - ) - vector_store_params = ( - {"vector_store_type": "WeaviateVectorStore"} - if vector_store_params is None - else vector_store_params - ) - service_context_params = ( - {} if service_context_params is None else service_context_params - ) - chat_engine_params = {} if chat_engine_params is None else chat_engine_params - - retriever_params = ( - {"retriever_type": "QueryFusionRetriever"} - if retriever_params is None - else retriever_params - ) - - llm = LyzrLLMFactory.from_defaults(**llm_params) - service_context = LyzrService.from_defaults( - llm=llm, - embed_model=embed_model, - system_prompt=system_prompt, - query_wrapper_prompt=query_wrapper_prompt, - **service_context_params, - ) - - vector_store_index = LyzrVectorStoreIndex.from_defaults( - **vector_store_params, documents=documents, service_context=service_context - ) - - retriever = LyzrRetriever.from_defaults( - **retriever_params, base_index=vector_store_index - ) - - memory = ChatMemoryBuffer.from_defaults(token_limit=4000) - - chat_engine = ContextChatEngine( - llm=llm, - memory=memory, - retriever=retriever, - prefix_messages=list(), - **chat_engine_params, - ) - - return chat_engine - - -def txt_chat_( - input_dir: Optional[str] = None, - input_files: Optional[List] = None, - exclude_hidden: bool = True, - filename_as_id: bool = True, - recursive: bool = True, - required_exts: Optional[List[str]] = None, - system_prompt: str = None, - query_wrapper_prompt: str = None, - embed_model: Union[str, EmbedType] = "default", - llm_params: dict = None, - vector_store_params: dict = None, - service_context_params: dict = None, - chat_engine_params: dict = None, - retriever_params: dict = None, -) -> BaseChatEngine: - documents = read_txt_as_documents( - input_dir=input_dir, - input_files=input_files, - exclude_hidden=exclude_hidden, - filename_as_id=filename_as_id, - recursive=recursive, - required_exts=required_exts, - ) - - llm_params = ( - { - "model": "gpt-4-0125-preview", - "temperature": 0, - } - if llm_params is None - else llm_params - ) - vector_store_params = ( - {"vector_store_type": "WeaviateVectorStore"} - if vector_store_params is None - else vector_store_params - ) - service_context_params = ( - {} if service_context_params is None else service_context_params - ) - chat_engine_params = {} if chat_engine_params is None else chat_engine_params - - retriever_params = ( - {"retriever_type": "QueryFusionRetriever"} - if retriever_params is None - else retriever_params - ) - - llm = LyzrLLMFactory.from_defaults(**llm_params) - service_context = LyzrService.from_defaults( - llm=llm, - embed_model=embed_model, - system_prompt=system_prompt, - query_wrapper_prompt=query_wrapper_prompt, - **service_context_params, - ) - - vector_store_index = LyzrVectorStoreIndex.from_defaults( - **vector_store_params, documents=documents, service_context=service_context - ) - - retriever = LyzrRetriever.from_defaults( - **retriever_params, base_index=vector_store_index - ) - - memory = ChatMemoryBuffer.from_defaults(token_limit=4000) - - chat_engine = ContextChatEngine( - llm=llm, - memory=memory, - retriever=retriever, - prefix_messages=list(), - **chat_engine_params, - ) - - return chat_engine - - -def docx_chat_( - input_dir: Optional[str] = None, - input_files: Optional[List] = None, - exclude_hidden: bool = True, - filename_as_id: bool = True, - recursive: bool = True, - required_exts: Optional[List[str]] = None, - system_prompt: str = None, - query_wrapper_prompt: str = None, - embed_model: Union[str, EmbedType] = "default", - llm_params: dict = None, - vector_store_params: dict = None, - service_context_params: dict = None, - chat_engine_params: dict = None, - retriever_params: dict = None, -) -> BaseChatEngine: - documents = read_docx_as_documents( - input_dir=input_dir, - input_files=input_files, - exclude_hidden=exclude_hidden, - filename_as_id=filename_as_id, - recursive=recursive, - required_exts=required_exts, - ) - - llm_params = ( - { - "model": "gpt-4-0125-preview", - "temperature": 0, - } - if llm_params is None - else llm_params - ) - vector_store_params = ( - {"vector_store_type": "WeaviateVectorStore"} - if vector_store_params is None - else vector_store_params - ) - service_context_params = ( - {} if service_context_params is None else service_context_params - ) - chat_engine_params = {} if chat_engine_params is None else chat_engine_params - - retriever_params = ( - {"retriever_type": "QueryFusionRetriever"} - if retriever_params is None - else retriever_params - ) - - llm = LyzrLLMFactory.from_defaults(**llm_params) - service_context = LyzrService.from_defaults( - llm=llm, - embed_model=embed_model, - system_prompt=system_prompt, - query_wrapper_prompt=query_wrapper_prompt, - **service_context_params, - ) - - vector_store_index = LyzrVectorStoreIndex.from_defaults( - **vector_store_params, documents=documents, service_context=service_context - ) - - retriever = LyzrRetriever.from_defaults( - **retriever_params, base_index=vector_store_index - ) - - memory = ChatMemoryBuffer.from_defaults(token_limit=4000) - - chat_engine = ContextChatEngine( - llm=llm, - memory=memory, - retriever=retriever, - prefix_messages=list(), - **chat_engine_params, - ) - - return chat_engine - - -def webpage_chat_( - url: str = None, - system_prompt: str = None, - query_wrapper_prompt: str = None, - embed_model: Union[str, EmbedType] = "default", - llm_params: dict = None, - vector_store_params: dict = None, - service_context_params: dict = None, - chat_engine_params: dict = None, - retriever_params: dict = None, -) -> BaseChatEngine: - documents = read_webpage_as_documents( - url=url, - ) - - llm_params = ( - { - "model": "gpt-4-0125-preview", - "temperature": 0, - } - if llm_params is None - else llm_params - ) - vector_store_params = ( - {"vector_store_type": "WeaviateVectorStore"} - if vector_store_params is None - else vector_store_params - ) - service_context_params = ( - {} if service_context_params is None else service_context_params - ) - chat_engine_params = {} if chat_engine_params is None else chat_engine_params - - retriever_params = ( - {"retriever_type": "QueryFusionRetriever"} - if retriever_params is None - else retriever_params - ) - - llm = LyzrLLMFactory.from_defaults(**llm_params) - service_context = LyzrService.from_defaults( - llm=llm, - embed_model=embed_model, - system_prompt=system_prompt, - query_wrapper_prompt=query_wrapper_prompt, - **service_context_params, - ) - - vector_store_index = LyzrVectorStoreIndex.from_defaults( - **vector_store_params, documents=documents, service_context=service_context - ) - - retriever = LyzrRetriever.from_defaults( - **retriever_params, base_index=vector_store_index - ) - - memory = ChatMemoryBuffer.from_defaults(token_limit=4000) - - chat_engine = ContextChatEngine( - llm=llm, - memory=memory, - retriever=retriever, - prefix_messages=list(), - **chat_engine_params, - ) - - return chat_engine - - -def website_chat_( - url: str = None, - system_prompt: str = None, - query_wrapper_prompt: str = None, - embed_model: Union[str, EmbedType] = "default", - llm_params: dict = None, - vector_store_params: dict = None, - service_context_params: dict = None, - chat_engine_params: dict = None, - retriever_params: dict = None, -) -> BaseChatEngine: - documents = read_website_as_documents( - url=url, - ) - - llm_params = ( - { - "model": "gpt-4-0125-preview", - "temperature": 0, - } - if llm_params is None - else llm_params - ) - vector_store_params = ( - {"vector_store_type": "WeaviateVectorStore"} - if vector_store_params is None - else vector_store_params - ) - service_context_params = ( - {} if service_context_params is None else service_context_params - ) - chat_engine_params = {} if chat_engine_params is None else chat_engine_params - - retriever_params = ( - {"retriever_type": "QueryFusionRetriever"} - if retriever_params is None - else retriever_params - ) - - llm = LyzrLLMFactory.from_defaults(**llm_params) - service_context = LyzrService.from_defaults( - llm=llm, - embed_model=embed_model, - system_prompt=system_prompt, - query_wrapper_prompt=query_wrapper_prompt, - **service_context_params, - ) - - vector_store_index = LyzrVectorStoreIndex.from_defaults( - **vector_store_params, documents=documents, service_context=service_context - ) - - retriever = LyzrRetriever.from_defaults( - **retriever_params, base_index=vector_store_index - ) - - memory = ChatMemoryBuffer.from_defaults(token_limit=4000) - - chat_engine = ContextChatEngine( - llm=llm, - memory=memory, - retriever=retriever, - prefix_messages=list(), - **chat_engine_params, - ) - - return chat_engine - - -def youtube_chat_( - urls: List[str] = None, - system_prompt: str = None, - query_wrapper_prompt: str = None, - embed_model: Union[str, EmbedType] = "default", - llm_params: dict = None, - vector_store_params: dict = None, - service_context_params: dict = None, - chat_engine_params: dict = None, - retriever_params: dict = None, -) -> BaseChatEngine: - documents = read_youtube_as_documents( - urls=urls, - ) - - llm_params = ( - { - "model": "gpt-4-0125-preview", - "temperature": 0, - } - if llm_params is None - else llm_params - ) - vector_store_params = ( - {"vector_store_type": "WeaviateVectorStore"} - if vector_store_params is None - else vector_store_params - ) - service_context_params = ( - {} if service_context_params is None else service_context_params - ) - chat_engine_params = {} if chat_engine_params is None else chat_engine_params - - retriever_params = ( - {"retriever_type": "QueryFusionRetriever"} - if retriever_params is None - else retriever_params - ) - - llm = LyzrLLMFactory.from_defaults(**llm_params) - service_context = LyzrService.from_defaults( - llm=llm, - embed_model=embed_model, - system_prompt=system_prompt, - query_wrapper_prompt=query_wrapper_prompt, - **service_context_params, - ) - - vector_store_index = LyzrVectorStoreIndex.from_defaults( - **vector_store_params, documents=documents, service_context=service_context - ) - - retriever = LyzrRetriever.from_defaults( - **retriever_params, base_index=vector_store_index - ) - - memory = ChatMemoryBuffer.from_defaults(token_limit=4000) - - chat_engine = ContextChatEngine( - llm=llm, - memory=memory, - retriever=retriever, - prefix_messages=list(), - **chat_engine_params, - ) - - return chat_engine diff --git a/lyzr/utils/constants.py b/lyzr/utils/constants.py deleted file mode 100644 index e69de29..0000000 diff --git a/lyzr/utils/db_utils.py b/lyzr/utils/db_utils.py deleted file mode 100644 index e69de29..0000000 diff --git a/lyzr/utils/document_reading.py b/lyzr/utils/document_reading.py deleted file mode 100644 index ca91750..0000000 --- a/lyzr/utils/document_reading.py +++ /dev/null @@ -1,118 +0,0 @@ -import logging -from typing import List, Sequence, Optional - -from llama_index.readers.file.base import SimpleDirectoryReader -from llama_index.schema import Document - -from lyzr.utils.docx_reader import LyzrDocxReader -from lyzr.utils.pdf_reader import LyzrPDFReader -from lyzr.utils.txt_reader import LyzrTxtReader -from lyzr.utils.webpage_reader import LyzrWebPageReader -from lyzr.utils.website_reader import LyzrWebsiteReader -from lyzr.utils.youtube_reader import LyzrYoutubeReader - -logger = logging.getLogger(__name__) - - -def read_pdf_as_documents( - input_dir: Optional[str] = None, - input_files: Optional[List] = None, - exclude_hidden: bool = True, - filename_as_id: bool = True, - recursive: bool = True, - required_exts: Optional[List[str]] = None, - **kwargs, -) -> Sequence[Document]: - file_extractor = {".pdf": LyzrPDFReader()} - - reader = SimpleDirectoryReader( - input_dir=input_dir, - exclude_hidden=exclude_hidden, - file_extractor=file_extractor, - input_files=input_files, - filename_as_id=filename_as_id, - recursive=recursive, - required_exts=required_exts, - **kwargs, - ) - - documents = reader.load_data() - - logger.info(f"Found {len(documents)} 'documents'.") - return documents - - -def read_docx_as_documents( - input_dir: Optional[str] = None, - input_files: Optional[List] = None, - exclude_hidden: bool = True, - filename_as_id: bool = True, - recursive: bool = True, - required_exts: Optional[List[str]] = None, - **kwargs, -) -> Sequence[Document]: - file_extractor = {".docx": LyzrDocxReader()} - - reader = SimpleDirectoryReader( - input_dir=input_dir, - exclude_hidden=exclude_hidden, - file_extractor=file_extractor, - input_files=input_files, - filename_as_id=filename_as_id, - recursive=recursive, - required_exts=required_exts, - **kwargs, - ) - - documents = reader.load_data() - - logger.info(f"Found {len(documents)} 'documents'.") - return documents - - -def read_txt_as_documents( - input_dir: Optional[str] = None, - input_files: Optional[List] = None, - exclude_hidden: bool = True, - filename_as_id: bool = True, - recursive: bool = True, - required_exts: Optional[List[str]] = None, - **kwargs, -) -> Sequence[Document]: - file_extractor = {".txt": LyzrTxtReader()} - - reader = SimpleDirectoryReader( - input_dir=input_dir, - exclude_hidden=exclude_hidden, - file_extractor=file_extractor, - input_files=input_files, - filename_as_id=filename_as_id, - recursive=recursive, - required_exts=required_exts, - **kwargs, - ) - - documents = reader.load_data() - - logger.info(f"Found {len(documents)} 'documents'.") - return documents - - -def read_website_as_documents(url: str) -> List[Document]: - reader = LyzrWebsiteReader() - documents = reader.load_data(url) - return documents - - -def read_webpage_as_documents(url: str) -> List[Document]: - reader = LyzrWebPageReader() - documents = reader.load_data(url) - return documents - - -def read_youtube_as_documents( - urls: List[str] = None, -) -> List[Document]: - reader = LyzrYoutubeReader() - documents = reader.load_data(urls) - return documents diff --git a/lyzr/utils/docx_reader.py b/lyzr/utils/docx_reader.py deleted file mode 100644 index b4bcab6..0000000 --- a/lyzr/utils/docx_reader.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import List - -from langchain.document_loaders import Docx2txtLoader -from llama_index.readers.base import BaseReader -from llama_index.schema import Document - - -class LyzrDocxReader(BaseReader): - def __init__(self) -> None: - try: - import docx2txt - except ImportError: - raise ImportError( - "`docx2txt` package not found, please run `pip install docx2txt`" - ) - - def load_data(self, file_path: str, extra_info: dict = None) -> List[Document]: - loader = Docx2txtLoader(str(file_path)) - langchain_documents = loader.load() - - documents = [] - for langchain_document in langchain_documents: - doc = Document.from_langchain_format(langchain_document) - - if extra_info is not None: - doc.metadata.update(extra_info) - - documents.append(doc) - - return documents diff --git a/lyzr/utils/env_utils.py b/lyzr/utils/env_utils.py deleted file mode 100644 index e69de29..0000000 diff --git a/lyzr/utils/pdf_reader.py b/lyzr/utils/pdf_reader.py deleted file mode 100644 index b0df6d9..0000000 --- a/lyzr/utils/pdf_reader.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import List - -from llmsherpa.readers import LayoutPDFReader -from llama_index.readers.base import BaseReader -from llama_index.schema import Document - - -class LyzrPDFReader(BaseReader): - def __init__(self) -> None: - try: - from llmsherpa.readers import LayoutPDFReader - except ImportError: - raise ImportError( - "`llmsherpa` package not found, please install it with " - "`pip install llmsherpa`" - ) - - def load_data(self, file_path: str, extra_info: dict = None) -> List[Document]: - llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all" - loader = LayoutPDFReader(llmsherpa_api_url) - - doc = loader.read_pdf(str(file_path)) - metadata = {"source": str(file_path)} - documents = [] - for chunk in doc.chunks(): - document = Document(text=chunk.to_context_text(), metadata=metadata) - documents.append(document) - - return documents diff --git a/lyzr/utils/rag_utils.py b/lyzr/utils/rag_utils.py deleted file mode 100644 index d8da770..0000000 --- a/lyzr/utils/rag_utils.py +++ /dev/null @@ -1,444 +0,0 @@ -from typing import Union, Optional, List - -from llama_index.embeddings.utils import EmbedType -from llama_index.indices.query.base import BaseQueryEngine -from llama_index.query_engine import RetrieverQueryEngine - -from lyzr.base.llm import LyzrLLMFactory -from lyzr.base.retrievers import LyzrRetriever -from lyzr.base.service import LyzrService -from lyzr.base.vector_store import LyzrVectorStoreIndex -from lyzr.utils.document_reading import ( - read_pdf_as_documents, - read_docx_as_documents, - read_txt_as_documents, - read_website_as_documents, - read_webpage_as_documents, - read_youtube_as_documents, -) - - -def pdf_rag( - input_dir: Optional[str] = None, - input_files: Optional[List] = None, - exclude_hidden: bool = True, - filename_as_id: bool = True, - recursive: bool = True, - required_exts: Optional[List[str]] = None, - system_prompt: str = None, - query_wrapper_prompt: str = None, - embed_model: Union[str, EmbedType] = "default", - llm_params: dict = None, - vector_store_params: dict = None, - service_context_params: dict = None, - query_engine_params: dict = None, - retriever_params: dict = None, -) -> BaseQueryEngine: - documents = read_pdf_as_documents( - input_dir=input_dir, - input_files=input_files, - exclude_hidden=exclude_hidden, - filename_as_id=filename_as_id, - recursive=recursive, - required_exts=required_exts, - ) - - llm_params = ( - { - "model": "gpt-4-0125-preview", - "temperature": 0, - } - if llm_params is None - else llm_params - ) - vector_store_params = ( - {"vector_store_type": "WeaviateVectorStore"} - if vector_store_params is None - else vector_store_params - ) - service_context_params = ( - {} if service_context_params is None else service_context_params - ) - query_engine_params = {} if query_engine_params is None else query_engine_params - - retriever_params = ( - {"retriever_type": "QueryFusionRetriever"} - if retriever_params is None - else retriever_params - ) - - llm = LyzrLLMFactory.from_defaults(**llm_params) - - service_context = LyzrService.from_defaults( - llm=llm, - embed_model=embed_model, - system_prompt=system_prompt, - query_wrapper_prompt=query_wrapper_prompt, - **service_context_params, - ) - - vector_store_index = LyzrVectorStoreIndex.from_defaults( - **vector_store_params, - documents=documents, - service_context=service_context, - similarity_top_k=10, - ) - - # retriever = LyzrRetriever.from_defaults( - # **retriever_params, base_index=vector_store_index - # ) - - # query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params) - query_engine = vector_store_index.as_query_engine(similarity_top_k=10) - - return query_engine - - -def txt_rag( - input_dir: Optional[str] = None, - input_files: Optional[List] = None, - exclude_hidden: bool = True, - filename_as_id: bool = True, - recursive: bool = True, - required_exts: Optional[List[str]] = None, - system_prompt: str = None, - query_wrapper_prompt: str = None, - embed_model: Union[str, EmbedType] = "default", - llm_params: dict = None, - vector_store_params: dict = None, - service_context_params: dict = None, - query_engine_params: dict = None, - retriever_params: dict = None, -) -> BaseQueryEngine: - documents = read_txt_as_documents( - input_dir=input_dir, - input_files=input_files, - exclude_hidden=exclude_hidden, - filename_as_id=filename_as_id, - recursive=recursive, - required_exts=required_exts, - ) - - llm_params = ( - { - "model": "gpt-4-0125-preview", - "temperature": 0, - } - if llm_params is None - else llm_params - ) - vector_store_params = ( - {"vector_store_type": "WeaviateVectorStore"} - if vector_store_params is None - else vector_store_params - ) - service_context_params = ( - {} if service_context_params is None else service_context_params - ) - query_engine_params = {} if query_engine_params is None else query_engine_params - - retriever_params = ( - {"retriever_type": "QueryFusionRetriever"} - if retriever_params is None - else retriever_params - ) - - llm = LyzrLLMFactory.from_defaults(**llm_params) - - service_context = LyzrService.from_defaults( - llm=llm, - embed_model=embed_model, - system_prompt=system_prompt, - query_wrapper_prompt=query_wrapper_prompt, - **service_context_params, - ) - - vector_store_index = LyzrVectorStoreIndex.from_defaults( - **vector_store_params, - documents=documents, - service_context=service_context, - similarity_top_k=10, - ) - - # retriever = LyzrRetriever.from_defaults( - # **retriever_params, base_index=vector_store_index - # ) - - # query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params) - query_engine = vector_store_index.as_query_engine(similarity_top_k=10) - - return query_engine - - -def docx_rag( - input_dir: Optional[str] = None, - input_files: Optional[List] = None, - exclude_hidden: bool = True, - filename_as_id: bool = True, - recursive: bool = True, - required_exts: Optional[List[str]] = None, - system_prompt: str = None, - query_wrapper_prompt: str = None, - embed_model: Union[str, EmbedType] = "default", - llm_params: dict = None, - vector_store_params: dict = None, - service_context_params: dict = None, - query_engine_params: dict = None, - retriever_params: dict = None, -) -> BaseQueryEngine: - documents = read_docx_as_documents( - input_dir=input_dir, - input_files=input_files, - exclude_hidden=exclude_hidden, - filename_as_id=filename_as_id, - recursive=recursive, - required_exts=required_exts, - ) - - llm_params = ( - { - "model": "gpt-4-0125-preview", - "temperature": 0, - } - if llm_params is None - else llm_params - ) - vector_store_params = ( - {"vector_store_type": "WeaviateVectorStore"} - if vector_store_params is None - else vector_store_params - ) - service_context_params = ( - {} if service_context_params is None else service_context_params - ) - query_engine_params = {} if query_engine_params is None else query_engine_params - - retriever_params = ( - {"retriever_type": "QueryFusionRetriever"} - if retriever_params is None - else retriever_params - ) - - llm = LyzrLLMFactory.from_defaults(**llm_params) - - service_context = LyzrService.from_defaults( - llm=llm, - embed_model=embed_model, - system_prompt=system_prompt, - query_wrapper_prompt=query_wrapper_prompt, - **service_context_params, - ) - - vector_store_index = LyzrVectorStoreIndex.from_defaults( - **vector_store_params, - documents=documents, - service_context=service_context, - similarity_top_k=10, - ) - - # retriever = LyzrRetriever.from_defaults( - # **retriever_params, base_index=vector_store_index - # ) - - # query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params) - query_engine = vector_store_index.as_query_engine(similarity_top_k=10) - - return query_engine - - -def webpage_rag( - url: str = None, - system_prompt: str = None, - query_wrapper_prompt: str = None, - embed_model: Union[str, EmbedType] = "default", - llm_params: dict = None, - vector_store_params: dict = None, - service_context_params: dict = None, - query_engine_params: dict = None, - retriever_params: dict = None, -) -> BaseQueryEngine: - documents = read_webpage_as_documents( - url=url, - ) - - llm_params = ( - { - "model": "gpt-4-0125-preview", - "temperature": 0, - } - if llm_params is None - else llm_params - ) - vector_store_params = ( - {"vector_store_type": "WeaviateVectorStore"} - if vector_store_params is None - else vector_store_params - ) - service_context_params = ( - {} if service_context_params is None else service_context_params - ) - query_engine_params = {} if query_engine_params is None else query_engine_params - - retriever_params = ( - {"retriever_type": "QueryFusionRetriever"} - if retriever_params is None - else retriever_params - ) - - llm = LyzrLLMFactory.from_defaults(**llm_params) - - service_context = LyzrService.from_defaults( - llm=llm, - embed_model=embed_model, - system_prompt=system_prompt, - query_wrapper_prompt=query_wrapper_prompt, - **service_context_params, - ) - - vector_store_index = LyzrVectorStoreIndex.from_defaults( - **vector_store_params, - documents=documents, - service_context=service_context, - similarity_top_k=10, - ) - - # retriever = LyzrRetriever.from_defaults( - # **retriever_params, base_index=vector_store_index - # ) - - # query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params) - query_engine = vector_store_index.as_query_engine(similarity_top_k=10) - - return query_engine - - -def website_rag( - url: str = None, - system_prompt: str = None, - query_wrapper_prompt: str = None, - embed_model: Union[str, EmbedType] = "default", - llm_params: dict = None, - vector_store_params: dict = None, - service_context_params: dict = None, - query_engine_params: dict = None, - retriever_params: dict = None, -) -> BaseQueryEngine: - documents = read_website_as_documents( - url=url, - ) - - llm_params = ( - { - "model": "gpt-4-0125-preview", - "temperature": 0, - } - if llm_params is None - else llm_params - ) - vector_store_params = ( - {"vector_store_type": "WeaviateVectorStore"} - if vector_store_params is None - else vector_store_params - ) - service_context_params = ( - {} if service_context_params is None else service_context_params - ) - query_engine_params = {} if query_engine_params is None else query_engine_params - - retriever_params = ( - {"retriever_type": "QueryFusionRetriever"} - if retriever_params is None - else retriever_params - ) - - llm = LyzrLLMFactory.from_defaults(**llm_params) - - service_context = LyzrService.from_defaults( - llm=llm, - embed_model=embed_model, - system_prompt=system_prompt, - query_wrapper_prompt=query_wrapper_prompt, - **service_context_params, - ) - - vector_store_index = LyzrVectorStoreIndex.from_defaults( - **vector_store_params, - documents=documents, - service_context=service_context, - similarity_top_k=10, - ) - - # retriever = LyzrRetriever.from_defaults( - # **retriever_params, base_index=vector_store_index - # ) - - # query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params) - query_engine = vector_store_index.as_query_engine(similarity_top_k=10) - - return query_engine - - -def youtube_rag( - urls: List[str] = None, - system_prompt: str = None, - query_wrapper_prompt: str = None, - embed_model: Union[str, EmbedType] = "default", - llm_params: dict = None, - vector_store_params: dict = None, - service_context_params: dict = None, - query_engine_params: dict = None, - retriever_params: dict = None, -) -> BaseQueryEngine: - documents = read_youtube_as_documents( - urls=urls, - ) - - llm_params = ( - { - "model": "gpt-4-0125-preview", - "temperature": 0, - } - if llm_params is None - else llm_params - ) - vector_store_params = ( - {"vector_store_type": "WeaviateVectorStore"} - if vector_store_params is None - else vector_store_params - ) - service_context_params = ( - {} if service_context_params is None else service_context_params - ) - query_engine_params = {} if query_engine_params is None else query_engine_params - - retriever_params = ( - {"retriever_type": "QueryFusionRetriever"} - if retriever_params is None - else retriever_params - ) - - llm = LyzrLLMFactory.from_defaults(**llm_params) - - service_context = LyzrService.from_defaults( - llm=llm, - embed_model=embed_model, - system_prompt=system_prompt, - query_wrapper_prompt=query_wrapper_prompt, - **service_context_params, - ) - - vector_store_index = LyzrVectorStoreIndex.from_defaults( - **vector_store_params, - documents=documents, - service_context=service_context, - similarity_top_k=10, - ) - - # retriever = LyzrRetriever.from_defaults( - # **retriever_params, base_index=vector_store_index - # ) - - # query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params) - query_engine = vector_store_index.as_query_engine(similarity_top_k=10) - - return query_engine diff --git a/lyzr/utils/txt_reader.py b/lyzr/utils/txt_reader.py deleted file mode 100644 index a83db1a..0000000 --- a/lyzr/utils/txt_reader.py +++ /dev/null @@ -1,25 +0,0 @@ -from typing import List - -from langchain.document_loaders import TextLoader -from llama_index.readers.base import BaseReader -from llama_index.schema import Document - - -class LyzrTxtReader(BaseReader): - def __init__(self) -> None: - None - - def load_data(self, file_path: str, extra_info: dict = None) -> List[Document]: - loader = TextLoader(str(file_path)) - langchain_documents = loader.load() - - documents = [] - for langchain_document in langchain_documents: - doc = Document.from_langchain_format(langchain_document) - - if extra_info is not None: - doc.metadata.update(extra_info) - - documents.append(doc) - - return documents diff --git a/lyzr/utils/webpage_reader.py b/lyzr/utils/webpage_reader.py deleted file mode 100644 index 2a6483d..0000000 --- a/lyzr/utils/webpage_reader.py +++ /dev/null @@ -1,98 +0,0 @@ -import sys -import asyncio -import logging -import warnings -import nest_asyncio -from typing import List, Set -from bs4 import BeautifulSoup, Tag -from typing import List -from llama_index.schema import Document - -IS_IPYKERNEL = "ipykernel_launcher" in sys.argv[0] - -if IS_IPYKERNEL: - nest_asyncio.apply() - -logger = logging.getLogger(__name__) - - -CONTENT_TAGS = [ - "p", - "div", - "span", - "a", - "td", - "tr", - "li", - "article", - "section", - "pre", - "code", - "blockquote", - "em", - "strong", - "b", - "i", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "title", -] - - -def scrape(html: str) -> str: - soup: BeautifulSoup = BeautifulSoup(html, "html.parser") - - content: List[Tag] = soup.find_all(CONTENT_TAGS) - - text_set: Set[str] = set() - - for p in content: - for text in p.stripped_strings: - text_set.add(text) - - return " ".join(text_set) - - -async def async_load_content_using_playwright(url: str) -> str: - - try: - from playwright.async_api import async_playwright - - async with async_playwright() as p: - browser = await p.chromium.launch() - page = await browser.new_page() - await page.goto(url) - html = await page.content() - await browser.close() - return html - - except ImportError: - raise ImportError( - "`playwright` package not found, please install it with " - "`pip install playwright && playwright install`" - ) - -def load_content_using_playwright(url: str) -> str: - return asyncio.get_event_loop().run_until_complete( - async_load_content_using_playwright(url) - ) - -class LyzrWebPageReader: - - def __init__(self) -> None: - pass - - @staticmethod - def load_data(url: str) -> List[Document]: - if IS_IPYKERNEL: - warning_msg = "Running in Google Colab or a Jupyter notebook. Consider using nest_asyncio.apply() to avoid event loop conflicts." - warnings.warn(warning_msg, RuntimeWarning) - - html = load_content_using_playwright(url) - content = scrape(html) - document = Document(text=content, metadata={"url": url}) - return [document] diff --git a/lyzr/utils/website_reader.py b/lyzr/utils/website_reader.py deleted file mode 100644 index 642c1e1..0000000 --- a/lyzr/utils/website_reader.py +++ /dev/null @@ -1,35 +0,0 @@ -import logging -from typing import List - -import requests -from bs4 import BeautifulSoup -from llama_index.schema import Document -from tqdm import tqdm - -from lyzr.utils.webpage_reader import LyzrWebPageReader - -logger = logging.getLogger(__name__) - - -class LyzrWebsiteReader: - def __init__(self): - self.visited_links = set() - - @staticmethod - def load_data(url: str) -> List[Document]: - reqs = requests.get(url) - soup = BeautifulSoup(reqs.text, "html.parser") - - all_urls = set() - for link in soup.find_all("a"): - href = link.get("href") - if href is not None: - all_urls.add(url + href) - - logger.info(f"Total URLs to process: {len(all_urls)}") - web_reader = LyzrWebPageReader() - documents = [] - for u in tqdm(all_urls, desc="Processing URLs"): - documents.extend(web_reader.load_data(u)) - - return documents diff --git a/lyzr/utils/youtube_reader.py b/lyzr/utils/youtube_reader.py deleted file mode 100644 index 300668d..0000000 --- a/lyzr/utils/youtube_reader.py +++ /dev/null @@ -1,21 +0,0 @@ -from typing import List - -from llama_index.readers.base import BaseReader -from llama_index.readers.youtube_transcript import YoutubeTranscriptReader -from llama_index.schema import Document - - -class LyzrYoutubeReader(BaseReader): - def __init__(self) -> None: - try: - from youtube_transcript_api import YouTubeTranscriptApi - except ImportError: - raise ImportError( - "`youtube_transcript_api` package not found, \ - please run `pip install youtube-transcript-api`" - ) - - def load_data(self, urls: List[str]) -> List[Document]: - loader = YoutubeTranscriptReader() - documents = loader.load_data(ytlinks=urls) - return documents diff --git a/setup.py b/setup.py index 6fb1626..7fcb3fc 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="lyzr", - version="0.1.27", + version="0.1.28", author="lyzr", description="", long_description=open("README.md").read(),