Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/load obsidian #14

Merged
merged 5 commits into from
Jun 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions apps/api/.envrc.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
export OPENAI_API_KEY=your-key
export SUPABASE_BASE_URL=https://zwxffmhaivlzfxonxdwq.supabase.co
export SUPABASE_ANON_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Inp3eGZmbWhhaXZsemZ4b254ZHdxIiwicm9sZSI6ImFub24iLCJpYXQiOjE2ODQwNDAyNjcsImV4cCI6MTk5OTYxNjI2N30.97SWP08kPH05hjMyuswjrqHqQ1-dCHcp_LxN1y3MY70
export ELEVENLABS_API_KEY=your-key
export ELEVENLABS_VOICE_ID=EXAVITQu4vr4xnSDxMaL
2 changes: 1 addition & 1 deletion apps/api/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,6 @@ dmypy.json
python.langchain.com/
*vectorstore.pkl

example_data
SOURCE_DOCUMENTS

*.mp3
38 changes: 25 additions & 13 deletions apps/api/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,47 @@

[ref](https://medium.com/@caetanoog/start-your-first-fastapi-server-with-poetry-in-10-minutes-fef90e9604d9)

### langchain chat example
run ingest.sh
## How to run
### Setup environment
First, activate the virtual environment.
```
$ sh ingest.sh
poetry shell
```
create vectorstore

Second, install the required packages.
```
poetry install
```

### Load Documents into memory
First, place your .pdf, .csv, .txt, and .obs files in SOURCE_DOCUMENTS.
(To load an Obsidian vault, write the vault's root path on the first line of an .obs file.)

Second, run ingest script.
```
run ingest.py
python ingest.py
```
start server

### Start FASTAPI server

```
make start
```


### API Document

[Swagger](http://127.0.0.1:9000/docs)
[Redoc](http://127.0.0.1:9000/redoc)


### Commands

`poetry shell` : run poetry virtual env
`poetry install` : install packages
`uvicorn main:app --reload` : start server
### Additional Commands
To install package

```
poetry add <package-name>
```

### Additional Commands
`poetry add <package-name>`

### Env variable
```
Expand Down
7 changes: 7 additions & 0 deletions apps/api/constant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import os

# Directory that contains this module (symlinks resolved via realpath).
ROOT_DIRECTORY = os.path.split(os.path.realpath(__file__))[0]

# Folder holding the source documents, and folder for storing the database.
SOURCE_DIRECTORY = ROOT_DIRECTORY + "/SOURCE_DOCUMENTS"
PERSIST_DIRECTORY = ROOT_DIRECTORY + "/DB"
74 changes: 44 additions & 30 deletions apps/api/ingest.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,62 @@
"""Load documents from SOURCE_DOCUMENTS, split them, and ingest them into a FAISS vector store."""
import os
import pickle
from typing import List

from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import ReadTheDocsLoader
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader, PyPDFLoader, CSVLoader, ObsidianLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

from constant import SOURCE_DIRECTORY

def load_single_document(file_path: str) -> List[Document]:
    """Load one source file into a list of documents.

    The loader is chosen from the file extension: ``.txt`` uses
    ``TextLoader`` (UTF-8), ``.pdf`` uses ``PyPDFLoader`` and ``.csv`` uses
    ``CSVLoader``. An ``.obs`` file is a pointer file: its first line is the
    root path handed to ``ObsidianLoader``.

    Args:
        file_path: Path of the file to load.

    Returns:
        The documents produced by the selected loader.

    Raises:
        ValueError: If the extension is not supported, or the first line of
            an ``.obs`` file is blank.
    """
    if file_path.endswith(".txt"):
        loader = TextLoader(file_path, encoding="utf8")
    elif file_path.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif file_path.endswith(".csv"):
        loader = CSVLoader(file_path)
    elif file_path.endswith(".obs"):
        # Context manager: the pointer file is closed even if reading fails.
        with open(file_path, "r") as f:
            obsidian_path = f.readline().strip()
        if not obsidian_path:
            # A blank path would otherwise be passed to the loader unchecked.
            raise ValueError(f"{file_path} does not contain an Obsidian root path")
        print(obsidian_path)
        loader = ObsidianLoader(obsidian_path)
    else:
        # Without this branch `loader` is unbound and the call below fails
        # with an unhelpful UnboundLocalError.
        raise ValueError(f"Unsupported file type: {file_path}")

    docs = loader.load()
    print(f"Loaded {len(docs)} documents from {file_path}")
    return docs


def load_documents(source_dir: str) -> List[Document]:
    """Load every supported file found directly inside ``source_dir``.

    Only entries whose name ends in ``.txt``, ``.pdf``, ``.csv`` or ``.obs``
    are handed to ``load_single_document``; the listing is not recursive.
    """
    supported_suffixes = (".txt", ".pdf", ".csv", ".obs")
    collected = []
    for entry_name in os.listdir(source_dir):
        if not entry_name.endswith(supported_suffixes):
            continue
        collected.extend(load_single_document(f"{source_dir}/{entry_name}"))
    return collected


def ingest_docs():
    """Build the FAISS vector store from the files in ``SOURCE_DIRECTORY``.

    Loads every supported document, splits the documents into chunks of
    1000 characters with a 200-character overlap, embeds the chunks with
    ``OpenAIEmbeddings`` and pickles the resulting FAISS index to
    ``vectorstore.pkl`` in the current working directory.

    Raises:
        ValueError: If no text chunks were produced, so there is nothing
            to index.
    """
    print(f"Loading documents from {SOURCE_DIRECTORY}")
    documents = load_documents(SOURCE_DIRECTORY)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)
    print(f"Loaded {len(documents)} documents from {SOURCE_DIRECTORY}")
    print(f"Split into {len(texts)} chunks of text")

    if not texts:
        # Fail with a clear message instead of an error from the index build.
        raise ValueError(f"No documents to ingest in {SOURCE_DIRECTORY}")

    # Embed the chunks that were just loaded; the earlier ReadTheDocs-based
    # body indexed a different corpus and discarded these chunks.
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(texts, embeddings)

    # Save vectorstore
    with open("vectorstore.pkl", "wb") as f:
        pickle.dump(vectorstore, f)

def ingest_pdf():
    """Build the FAISS vector store from the hard-coded example PDF.

    Loads ``example_data/Jaeyoun_s_CV.pdf``, splits it into chunks of 1000
    characters with a 200-character overlap, embeds the chunks with
    ``OpenAIEmbeddings`` and pickles the FAISS index to ``vectorstore.pkl``
    in the current working directory.
    """
    loader = PyPDFLoader("example_data/Jaeyoun_s_CV.pdf")
    raw_documents = loader.load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    documents = text_splitter.split_documents(raw_documents)
    print(documents)
    embeddings = OpenAIEmbeddings()
    # Index the chunks produced above; `texts` is not defined in this function.
    vectorstore = FAISS.from_documents(documents, embeddings)

    # Save vectorstore (single output file; no empty side file is created)
    with open("vectorstore.pkl", "wb") as f:
        pickle.dump(vectorstore, f)


if __name__ == "__main__":
    # Running the script indexes the documents via ingest_docs() only; the
    # example-PDF ingest is not run here because its output file would be
    # overwritten by this call anyway.
    ingest_docs()
40 changes: 38 additions & 2 deletions apps/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,11 @@
from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect, File, Form, UploadFile
from fastapi.middleware.cors import CORSMiddleware

from fastapi.responses import FileResponse

from fastapi.templating import Jinja2Templates
from langchain.vectorstores import VectorStore
import requests

from callback import QuestionGenCallbackHandler, StreamingLLMCallbackHandler
from query_data import get_chain
Expand Down Expand Up @@ -42,9 +45,9 @@
@app.on_event("startup")
async def startup_event():
    """Load the pickled vector store into the module-level ``vectorstore``.

    Raises:
        ValueError: If ``vectorstore.pkl`` does not exist in the current
            working directory.
    """
    global vectorstore
    logging.info("loading vectorstore")
    # Check the same file name that is opened below and named in the error.
    if not Path("vectorstore.pkl").exists():
        raise ValueError("vectorstore.pkl does not exist, please run ingest.py first")
    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # only load a vectorstore.pkl that was produced by a trusted ingest run.
    with open("vectorstore.pkl", "rb") as f:
        vectorstore = pickle.load(f)

Expand All @@ -70,6 +73,39 @@ async def transcriptions(audioData: UploadFile,model: Annotated[str, Form()]):
transcription = openai.Audio.transcribe("whisper-1", audio_file)
return transcription

@app.post("/tts")
async def tts(text: Annotated[str, Form()]):
    """Convert ``text`` to speech via the ElevenLabs streaming API.

    The audio stream is written to ``./tts/speech.mp3`` and that file is
    returned to the client.

    Args:
        text: Form field containing the text to synthesize.

    Returns:
        A ``FileResponse`` serving the generated MP3 file.

    Raises:
        KeyError: If ``ELEVENLABS_API_KEY`` or ``ELEVENLABS_VOICE_ID`` is
            not set in the environment.
        HTTPException: 502 if ElevenLabs answers with a non-success status.
    """
    # Local import: HTTPException is not among this module's visible imports.
    from fastapi import HTTPException

    OUTPUT_FILE = "./tts/speech.mp3"
    CHUNK_SIZE = 1024
    REQUEST_TIMEOUT_SECONDS = 60
    elevenlabs_api_key = os.environ["ELEVENLABS_API_KEY"]
    voice_id = os.environ["ELEVENLABS_VOICE_ID"]
    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
    headers = {
        # The endpoint streams audio, so ask for audio rather than JSON.
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": elevenlabs_api_key,
    }

    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0,
            "similarity_boost": 0,
        },
    }

    # NOTE(review): requests is synchronous, so this call blocks the event
    # loop while the audio downloads; the timeout at least bounds the wait.
    with requests.post(
        tts_url,
        json=data,
        headers=headers,
        stream=True,
        timeout=REQUEST_TIMEOUT_SECONDS,
    ) as response:
        if not response.ok:
            # Do not save an upstream error body and serve it as audio.
            raise HTTPException(
                status_code=502,
                detail=f"ElevenLabs request failed with status {response.status_code}",
            )

        # The output directory may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
        with open(OUTPUT_FILE, "wb") as f:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:
                    f.write(chunk)

    return FileResponse(OUTPUT_FILE, media_type="audio/mp3")



@app.websocket("/chat")
async def websocket_endpoint(websocket: WebSocket):
await websocket.accept()
Expand Down
29 changes: 24 additions & 5 deletions apps/web/src/pages/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import axios from "axios";
export default function Home() {
const [ws, setWs] = useState<WebSocket>(null);
const [isProcessing, setIsProcessing] = useState(false);
const audioRef = useRef(null);


const onTranscribe = async (blob: Blob) => {
if (childRef.current) {
Expand All @@ -28,7 +30,7 @@ export default function Home() {
const { text } = await response.data;
// you must return result from your server in Transcript format
if (childRef.current) {
console.log("finish")
console.log("finish");
childRef.current.finish();
}

Expand Down Expand Up @@ -65,8 +67,6 @@ export default function Home() {
end: () => {},
});

const delay = (ms: number) => new Promise((res) => setTimeout(res, ms));

const listen = () => {
// stt
if (childRef.current) {
Expand All @@ -88,7 +88,7 @@ export default function Home() {
const endpoint = "ws://localhost:9000/chat";
const ws = new WebSocket(endpoint);

ws.onmessage = function (event) {
ws.onmessage = async function (event) {
const messages = document.getElementById("messages");
const data = JSON.parse(event.data);
if (data.sender === "bot") {
Expand All @@ -99,7 +99,6 @@ export default function Home() {
p.innerHTML = "JANOT: ";
div.appendChild(p);
messages.appendChild(div);

} else if (data.type === "stream") {
const p = messages.lastChild.lastChild as HTMLParagraphElement;
if (data.message === "\n") {
Expand All @@ -109,6 +108,25 @@ export default function Home() {
}
} else if (data.type === "info") {
} else if (data.type === "end") {
const p = messages.lastChild.lastChild as HTMLParagraphElement;
const finalText = (p.innerHTML).split("JANOT: ")[1];
const enableTTS = false;
// Call tts
if (enableTTS) {
const formdata = new FormData();
formdata.append("text", finalText);
const url = "http://localhost:9000/tts";
const response = await axios.post(url, formdata, {
headers: { "Content-Type": "multipart/form-data" },
responseType: "blob",
});
const audioURL = URL.createObjectURL(response.data);
const audioElement = audioRef.current;
audioElement.src = audioURL;
audioElement.play();
}


if (childRef.current) {
console.log("end");
childRef.current.end();
Expand Down Expand Up @@ -176,6 +194,7 @@ export default function Home() {
className="overflow-auto text-center text-xl font-thin tracking-tight text-white"
></div>
</div>
<audio ref={audioRef} controls />
</main>
</div>
);
Expand Down