From 0c64626e7e38e84b18c8af03479e901f70d70bb3 Mon Sep 17 00:00:00 2001 From: Jaeyoun Nam Date: Sun, 4 Jun 2023 01:32:54 +0900 Subject: [PATCH] Feature/load obsidian (#14) * feat: tts using ElevenLabs (not streaming) (#10) * feat: load all files in SOURCE_DOCUMENTS (#11) * feat: update obsidian loader * feat: update README # Conflicts: # apps/api/README.md # apps/api/ingest.py --- apps/api/README.md | 3 ++- apps/api/ingest.py | 21 +++++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/apps/api/README.md b/apps/api/README.md index 322d886..1035c41 100644 --- a/apps/api/README.md +++ b/apps/api/README.md @@ -18,7 +18,8 @@ poetry install ``` ### Load Documents into memory -First, locate your .pdf, .csv, .txt files in SOURCE_DOCUMENTS +First, locate your .pdf, .csv, .txt, .obs files in SOURCE_DOCUMENTS +(Write the Obsidian root path into the .obs file to load Obsidian) Second, run ingest script. ``` diff --git a/apps/api/ingest.py b/apps/api/ingest.py index bf200bc..5e00550 100644 --- a/apps/api/ingest.py +++ b/apps/api/ingest.py @@ -4,14 +4,14 @@ from typing import List from langchain.docstore.document import Document -from langchain.document_loaders import TextLoader, PyPDFLoader, CSVLoader +from langchain.document_loaders import TextLoader, PyPDFLoader, CSVLoader, ObsidianLoader from langchain.embeddings import OpenAIEmbeddings from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.vectorstores.faiss import FAISS from constant import SOURCE_DIRECTORY -def load_single_document(file_path: str) -> Document: +def load_single_document(file_path: str) -> List[Document]: # Loads a single document from a file path if file_path.endswith(".txt"): loader = TextLoader(file_path, encoding="utf8") @@ -19,13 +19,26 @@ def load_single_document(file_path: str) -> Document: loader = PyPDFLoader(file_path) elif file_path.endswith(".csv"): loader = CSVLoader(file_path) - return loader.load()[0] + elif file_path.endswith(".obs"): + f = 
open(file_path, 'r') + obsidian_path = f.readline() + print(obsidian_path) + loader = ObsidianLoader(obsidian_path.strip()) + docs = loader.load() + print(f"Loaded {len(docs)} documents from {file_path}") + return docs def load_documents(source_dir: str) -> List[Document]: # Loads all documents from source documents directory all_files = os.listdir(source_dir) - return [load_single_document(f"{source_dir}/{file_path}") for file_path in all_files if file_path[-4:] in ['.txt', '.pdf', '.csv'] ] + docs = [] + for file_path in all_files : + if file_path[-4:] in ['.txt', '.pdf', '.csv', '.obs']: + absolute_path = (f"{source_dir}/{file_path}") + docs += load_single_document(absolute_path) + + return docs def ingest_docs():