Merge pull request #281 from rmusser01/dev
Fixes + RAG Enhancements
rmusser01 authored Sep 27, 2024
2 parents ff73357 + fb051e7 commit ad4f64c
Showing 17 changed files with 762 additions and 350 deletions.
Binary file modified .gitignore
6 changes: 3 additions & 3 deletions App_Function_Libraries/Audio/Audio_Transcription_Lib.py
@@ -50,7 +50,7 @@

 class WhisperModel(OriginalWhisperModel):
     tldw_dir = os.path.dirname(os.path.dirname(__file__))
-    default_download_root = os.path.join(tldw_dir, 'App_Function_Libraries', 'models', 'Whisper')
+    default_download_root = os.path.join(tldw_dir, 'models', 'Whisper')

     valid_model_sizes = [
         "tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
@@ -207,8 +207,8 @@ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='me

     try:
         _, file_ending = os.path.splitext(audio_file_path)
-        out_file = audio_file_path.replace(file_ending, ".segments.json")
-        prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
+        out_file = audio_file_path.replace(file_ending, "-whisper_model-"+whisper_model+".segments.json")
+        prettified_out_file = audio_file_path.replace(file_ending, "-whisper_model-"+whisper_model+".segments_pretty.json")
         if os.path.exists(out_file):
             logging.info("speech-to-text: Segments file already exists: %s", out_file)
             with open(out_file) as f:
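Since tldw_dir already resolves two directory levels up from this file, the corrected download root points at the repository-level models/Whisper directory, and the new suffix keeps transcripts from different Whisper models from overwriting each other. A quick sketch of the resulting segment path (input path is hypothetical):

    import os

    # Hypothetical input; mirrors the renaming logic in speech_to_text above.
    audio_file_path = "/data/episode.mp3"
    whisper_model = "medium"

    _, file_ending = os.path.splitext(audio_file_path)
    out_file = audio_file_path.replace(file_ending, "-whisper_model-" + whisper_model + ".segments.json")
    print(out_file)  # /data/episode-whisper_model-medium.segments.json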
6 changes: 3 additions & 3 deletions App_Function_Libraries/Chunk_Lib.py
@@ -476,22 +476,22 @@ def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100, unit='
 #
 # Embedding Chunking

-def chunk_for_embedding(text: str, file_name: str, full_summary: str, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
+def chunk_for_embedding(text: str, file_name: str, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
     options = chunk_options.copy()
     if custom_chunk_options:
         options.update(custom_chunk_options)

     logging.info(f"Chunking options: {options}")
     chunks = improved_chunking_process(text, options)
     total_chunks = len(chunks)
     logging.info(f"Total chunks created: {total_chunks}")

     chunked_text_with_headers = []
     for i, chunk in enumerate(chunks, 1):
         chunk_text = chunk['text']
         chunk_position = determine_chunk_position(chunk['metadata']['relative_position'])

         chunk_header = f"""
 Original Document: {file_name}
-Full Document Summary: {full_summary or "Full document summary not available."}
 Chunk: {i} of {total_chunks}
 Position: {chunk_position}
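With full_summary dropped from the signature, the per-chunk header no longer embeds a document summary; callers pass only the text, the file name, and optional chunk options. A minimal call sketch of the new signature (values hypothetical; options mirror the defaults used elsewhere in this diff):

    # Hypothetical usage of the new chunk_for_embedding signature.
    chunks = chunk_for_embedding(
        text=document_text,
        file_name="example_document.txt",
        custom_chunk_options={'method': 'words', 'max_size': 500, 'overlap': 200},
    )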
16 changes: 13 additions & 3 deletions App_Function_Libraries/DB/DB_Manager.py
@@ -50,7 +50,7 @@
     check_media_and_whisper_model as sqlite_check_media_and_whisper_model, \
     create_document_version as sqlite_create_document_version, \
     get_document_version as sqlite_get_document_version, sqlite_search_db, add_media_chunk as sqlite_add_media_chunk, \
-    sqlite_update_fts_for_media, sqlite_get_unprocessed_media, fetch_item_details as sqlite_fetch_item_details, \
+    sqlite_update_fts_for_media, get_unprocessed_media as sqlite_get_unprocessed_media, fetch_item_details as sqlite_fetch_item_details, \
     search_media_database as sqlite_search_media_database, mark_as_trash as sqlite_mark_as_trash, \
     get_media_transcripts as sqlite_get_media_transcripts, get_specific_transcript as sqlite_get_specific_transcript, \
     get_media_summaries as sqlite_get_media_summaries, get_specific_summary as sqlite_get_specific_summary, \
@@ -68,7 +68,7 @@
     get_workflow_chat as sqlite_get_workflow_chat, update_media_content_with_version as sqlite_update_media_content_with_version, \
     check_existing_media as sqlite_check_existing_media, get_all_document_versions as sqlite_get_all_document_versions, \
     fetch_paginated_data as sqlite_fetch_paginated_data, get_latest_transcription as sqlite_get_latest_transcription, \
-
+    mark_media_as_processed as sqlite_mark_media_as_processed,
 )
 #
 # Local Imports
@@ -417,7 +417,7 @@ def update_fts_for_media(media_id: int):
         raise ValueError(f"Unsupported database type: {db_type}")


-def get_unprocessed_media():
+def get_unprocessed_media(*args, **kwargs):
     if db_type == 'sqlite':
         return sqlite_get_unprocessed_media(db)
     elif db_type == 'elasticsearch':
@@ -427,6 +427,16 @@ def get_unprocessed_media():
         raise ValueError(f"Unsupported database type: {db_type}")


+def mark_media_as_processed(*args, **kwargs):
+    if db_type == 'sqlite':
+        return sqlite_mark_media_as_processed(*args, **kwargs)
+    elif db_type == 'elasticsearch':
+        # Implement Elasticsearch version
+        raise NotImplementedError("Elasticsearch version of mark_media_as_processed not yet implemented")
+    else:
+        raise ValueError(f"Unsupported database type: {db_type}")


 #
 # End of DB-Ingestion functions
 ############################################################################################################
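DB_Manager keeps one backend-dispatch wrapper per storage helper so callers never import from SQLite_DB directly. A minimal usage sketch of the two new entry points, assuming the SQLite backend, that DB_Manager's module-level db handle is importable, and that the rows come back as (id, content, type, file_name) tuples per the query in SQLite_DB below:

    from App_Function_Libraries.DB.DB_Manager import get_unprocessed_media, mark_media_as_processed, db

    # Walk every Media row the vector pipeline has not touched yet.
    for media_id, content, media_type, file_name in get_unprocessed_media():
        # ... create and store embeddings for `content` here ...
        mark_media_as_processed(db, media_id)  # forwarded to the SQLite helper, which expects (database, media_id)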
45 changes: 38 additions & 7 deletions App_Function_Libraries/DB/SQLite_DB.py
@@ -301,7 +301,8 @@ def create_tables(db) -> None:
         is_trash BOOLEAN DEFAULT 0,
         trash_date DATETIME,
         vector_embedding BLOB,
-        chunking_status TEXT DEFAULT 'pending'
+        chunking_status TEXT DEFAULT 'pending',
+        vector_processing INTEGER DEFAULT 0
     )
     ''',
     '''
@@ -564,11 +565,14 @@ def sqlite_update_fts_for_media(db, media_id: int):
         conn.commit()


-def sqlite_get_unprocessed_media(db):
-    with db.get_connection() as conn:
-        cursor = conn.cursor()
-        cursor.execute("SELECT id, content, type FROM Media WHERE id NOT IN (SELECT DISTINCT media_id FROM MediaChunks)")
-        return cursor.fetchall()
+def get_unprocessed_media(db):
+    query = """
+    SELECT id, content, type, COALESCE(title, '') as file_name
+    FROM Media
+    WHERE vector_processing = 0
+    ORDER BY id
+    """
+    return db.execute_query(query)

 def get_next_media_id():
     try:
@@ -580,8 +584,18 @@ def get_next_media_id():
     finally:
         conn.close()


+def mark_media_as_processed(database, media_id):
+    try:
+        query = "UPDATE Media SET vector_processing = 1 WHERE id = ?"
+        database.execute_query(query, (media_id,))
+        logger.info(f"Marked media_id {media_id} as processed")
+    except Exception as e:
+        logger.error(f"Error marking media_id {media_id} as processed: {str(e)}")
+        raise
+
 #
-# End of Media-related Functions
+# End of Vector-chunk-related Functions
 #######################################################################################################################


@@ -2896,6 +2910,23 @@ def update_media_table(db):
     # Add chunking_status column if it doesn't exist
     add_missing_column_if_not_exists(db, 'Media', 'chunking_status', "TEXT DEFAULT 'pending'")

+# Vector check FIXME/Delete later
+def alter_media_table(db):
+    alter_query = '''
+    ALTER TABLE Media ADD COLUMN vector_processing INTEGER DEFAULT 0
+    '''
+    try:
+        db.execute_query(alter_query)
+        logging.info("Media table altered successfully to include vector_processing column.")
+    except Exception as e:
+        logging.error(f"Error altering Media table: {str(e)}")
+        # If the column already exists, SQLite will throw an error, which we can safely ignore
+        if "duplicate column name" not in str(e).lower():
+            raise
+
+# Vector check FIXME/Delete later
+alter_media_table(db)
+
 #
 # End of Functions to manage media chunks
 #######################################################################################################################
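Together, the new column and helpers give each Media row a one-bit vector-processing flag that the embedding pipeline reads and sets. A self-contained sketch of that lifecycle against a scratch SQLite database (schema trimmed to the relevant columns):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE Media (id INTEGER PRIMARY KEY, content TEXT, type TEXT, title TEXT, vector_processing INTEGER DEFAULT 0)")
    conn.execute("INSERT INTO Media (content, type, title) VALUES ('hello world', 'text', 'demo')")

    # Same SELECT as get_unprocessed_media above.
    rows = conn.execute(
        "SELECT id, content, type, COALESCE(title, '') as file_name FROM Media WHERE vector_processing = 0 ORDER BY id"
    ).fetchall()
    print(rows)  # [(1, 'hello world', 'text', 'demo')]

    # Same UPDATE as mark_media_as_processed above.
    conn.execute("UPDATE Media SET vector_processing = 1 WHERE id = ?", (1,))
    print(conn.execute("SELECT COUNT(*) FROM Media WHERE vector_processing = 0").fetchone())  # (0,)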
2 changes: 1 addition & 1 deletion App_Function_Libraries/Gradio_UI/Chat_ui.py
@@ -214,7 +214,7 @@ def create_chat_interface():
                 value="You are a helpful AI assitant",
                 lines=3,
                 visible=False)
-            with gr.Column():
+            with gr.Column(scale=2):
                 chatbot = gr.Chatbot(height=600, elem_classes="chatbot-container")
                 msg = gr.Textbox(label="Enter your message")
                 submit = gr.Button("Submit")
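In Gradio, sibling columns split a row in proportion to their scale values, so the chat column now takes roughly twice the width of its neighbor. A minimal layout sketch of that effect (hypothetical two-column row, not the actual tab layout):

    import gradio as gr

    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column(scale=1):
                gr.Textbox(label="Settings")   # narrow side panel
            with gr.Column(scale=2):
                gr.Chatbot(height=600)         # chat area gets 2/3 of the row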
73 changes: 47 additions & 26 deletions App_Function_Libraries/Gradio_UI/Embeddings_tab.py
@@ -7,14 +7,17 @@
 #
 # External Imports
 import gradio as gr
+from tqdm import tqdm

-from App_Function_Libraries.Chunk_Lib import improved_chunking_process, determine_chunk_position
+from App_Function_Libraries.Chunk_Lib import improved_chunking_process, chunk_for_embedding
 #
 # Local Imports
 from App_Function_Libraries.DB.DB_Manager import get_all_content_from_database
 from App_Function_Libraries.RAG.ChromaDB_Library import chroma_client, \
-    store_in_chroma
-from App_Function_Libraries.RAG.Embeddings_Create import create_embedding
+    store_in_chroma, situate_context
+from App_Function_Libraries.RAG.Embeddings_Create import create_embedding, create_embeddings_batch
+
+
 #
 ########################################################################################################################
 #
@@ -174,17 +177,23 @@ def create_view_embeddings_tab():
                     value="words"
                 )
                 max_chunk_size = gr.Slider(
-                    minimum=1, maximum=8000, step=1, value=500,
+                    minimum=1, maximum=8000, step=5, value=500,
                     label="Max Chunk Size"
                 )
                 chunk_overlap = gr.Slider(
-                    minimum=0, maximum=5000, step=1, value=200,
+                    minimum=0, maximum=5000, step=5, value=200,
                     label="Chunk Overlap"
                 )
                 adaptive_chunking = gr.Checkbox(
                     label="Use Adaptive Chunking",
                     value=False
                 )
+                contextual_api_choice = gr.Dropdown(
+                    choices=["Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral", "OpenRouter", "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM", "ollama", "HuggingFace"],
+                    label="Select API for Contextualized Embeddings",
+                    value="OpenAI"
+                )
+                contextual_api_key = gr.Textbox(label="API Key", lines=1)

         def get_items_with_embedding_status():
             try:
@@ -242,7 +251,7 @@ def check_embedding_status(selected_item, item_mapping):
                 logging.error(f"Error in check_embedding_status: {str(e)}")
                 return f"Error processing item: {selected_item}. Details: {str(e)}", "", ""

-        def create_new_embedding_for_item(selected_item, provider, model, api_url, method, max_size, overlap, adaptive, item_mapping):
+        def create_new_embedding_for_item(selected_item, provider, model, api_url, method, max_size, overlap, adaptive, item_mapping, contextual_api_choice=None):
             if not selected_item:
                 return "Please select an item", "", ""

@@ -263,31 +272,30 @@
                     'adaptive': adaptive
                 }

-                chunks = improved_chunking_process(item['content'], chunk_options)
+                logging.info(f"Chunking content for item: {item['title']} (ID: {item_id})")
+                chunks = chunk_for_embedding(item['content'], item['title'], chunk_options)
                 collection_name = "all_content_embeddings"
                 collection = chroma_client.get_or_create_collection(name=collection_name)

                 # Delete existing embeddings for this item
                 existing_ids = [f"doc_{item_id}_chunk_{i}" for i in range(len(chunks))]
                 collection.delete(ids=existing_ids)
+                logging.info(f"Deleted {len(existing_ids)} existing embeddings for item {item_id}")

-                for i, chunk in enumerate(chunks):
+                texts, ids, metadatas = [], [], []
+                chunk_count = 0
+                logging.info("Generating contextual summaries and preparing chunks for embedding")
+                for i, chunk in tqdm(enumerate(chunks), total=len(chunks), desc="Processing chunks"):
                     chunk_text = chunk['text']
                     chunk_metadata = chunk['metadata']
-                    chunk_position = determine_chunk_position(chunk_metadata['relative_position'])
-
-                    chunk_header = f"""
-Original Document: {item['title']}
-Chunk: {i + 1} of {len(chunks)}
-Position: {chunk_position}
-Header: {chunk_metadata.get('header_text', 'N/A')}
---- Chunk Content ---
-"""
-
-                    full_chunk_text = chunk_header + chunk_text
+                    if chunk_count == 0:
+                        chunk_count = 1
+                    # Generate contextual summary
+                    logging.debug(f"Generating contextual summary for chunk {chunk_count}")
+                    context = situate_context(contextual_api_choice, item['content'], chunk_text)
+                    contextualized_text = f"{chunk_text}\n\nContextual Summary: {context}"

                     chunk_id = f"doc_{item_id}_chunk_{i}"
-                    embedding = create_embedding(full_chunk_text, provider, model, api_url)
                     metadata = {
                         "media_id": str(item_id),
                         "chunk_index": i,
@@ -298,13 +306,26 @@ def create_new_embedding_for_item(selected_item, provider, model, api_url, metho
                         "adaptive_chunking": adaptive,
                         "embedding_model": model,
                         "embedding_provider": provider,
+                        "original_text": chunk_text,
+                        "contextual_summary": context,
                         **chunk_metadata
                     }
-                    store_in_chroma(collection_name, [full_chunk_text], [embedding], [chunk_id], [metadata])

-                    embedding_preview = str(embedding[:50])
-                    status = f"New embeddings created and stored for item: {item['title']} (ID: {item_id})"
-                    return status, f"First 50 elements of new embedding:\n{embedding_preview}", json.dumps(metadata, indent=2)
+                    texts.append(contextualized_text)
+                    ids.append(chunk_id)
+                    metadatas.append(metadata)
+                    chunk_count = chunk_count+1
+
+                # Create embeddings in batch
+                logging.info(f"Creating embeddings for {len(texts)} chunks")
+                embeddings = create_embeddings_batch(texts, provider, model, api_url)
+
+                # Store in Chroma
+                store_in_chroma(collection_name, texts, embeddings, ids, metadatas)
+
+                embedding_preview = str(embeddings[0][:50]) if embeddings else "No embeddings created"
+                status = f"New contextual embeddings created and stored for item: {item['title']} (ID: {item_id})"
+                return status, f"First 50 elements of new embedding:\n{embedding_preview}", json.dumps(metadatas[0], indent=2)
             except Exception as e:
                 logging.error(f"Error in create_new_embedding_for_item: {str(e)}")
                 return f"Error creating embedding: {str(e)}", "", ""
@@ -321,7 +342,7 @@ def create_new_embedding_for_item(selected_item, provider, model, api_url, metho
             create_new_embedding_button.click(
                 create_new_embedding_for_item,
                 inputs=[item_dropdown, embedding_provider, embedding_model, embedding_api_url,
-                        chunking_method, max_chunk_size, chunk_overlap, adaptive_chunking, item_mapping],
+                        chunking_method, max_chunk_size, chunk_overlap, adaptive_chunking, item_mapping, contextual_api_choice],
                 outputs=[embedding_status, embedding_preview, embedding_metadata]
            )
            embedding_provider.change(
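The rewritten tab follows a contextual-retrieval pattern: each chunk is embedded together with a short LLM-generated summary that situates it in the full document, and all chunks for an item go through one batched embedding call instead of per-chunk requests. A condensed sketch of the per-item flow, using only the helpers named in this diff (their exact signatures are assumed from the call sites above):

    # Condensed sketch of the new contextual-embedding flow.
    chunks = chunk_for_embedding(item['content'], item['title'], chunk_options)
    texts = []
    for chunk in chunks:
        # Ask the chosen LLM where this chunk sits in the whole document.
        context = situate_context(contextual_api_choice, item['content'], chunk['text'])
        texts.append(f"{chunk['text']}\n\nContextual Summary: {context}")
    embeddings = create_embeddings_batch(texts, provider, model, api_url)  # one batched call per item
    store_in_chroma("all_content_embeddings", texts, embeddings, ids, metadatas)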
13 changes: 7 additions & 6 deletions App_Function_Libraries/Gradio_UI/RAG_QA_Chat_tab.py
@@ -15,7 +15,7 @@
 from App_Function_Libraries.Books.Book_Ingestion_Lib import read_epub
 from App_Function_Libraries.DB.DB_Manager import DatabaseError, get_paginated_files, add_media_with_keywords
 from App_Function_Libraries.PDF.PDF_Ingestion_Lib import extract_text_and_format_from_pdf
-from App_Function_Libraries.RAG.RAG_Libary_2 import generate_answer
+from App_Function_Libraries.RAG.RAG_Libary_2 import generate_answer, enhanced_rag_pipeline
 from App_Function_Libraries.RAG.RAG_QA_Chat import search_database, rag_qa_chat
 # Eventually... FIXME
 from App_Function_Libraries.RAG.RAG_QA_Chat import load_chat_history, save_chat_history
@@ -31,9 +31,9 @@ def create_rag_qa_chat_tab():
         with gr.Row():
             with gr.Column(scale=1):
                 context_source = gr.Radio(
-                    ["Existing File", "Search Database", "Upload File"],
+                    ["All Files in the Database", "Search Database", "Upload File"],
                     label="Context Source",
-                    value="Existing File"
+                    value="All Files in the Database"
                 )
                 existing_file = gr.Dropdown(label="Select Existing File", choices=[], interactive=True)
                 file_page = gr.State(value=1)
@@ -127,9 +127,10 @@ def rag_qa_chat_wrapper(message, history, context_source, existing_file, search_
                 rephrased_question = message
                 logging.info(f"First question, no rephrasing: {message}")

-            if context_source == "Existing File":
-                context = f"media_id:{existing_file.split('(ID: ')[1][:-1]}"
-                logging.info(f"Using existing file with context: {context}")
+            if context_source == "All Files in the Database":
+                # Use the enhanced_rag_pipeline to search the entire database
+                context = enhanced_rag_pipeline(rephrased_question, api_choice)
+                logging.info(f"Using enhanced_rag_pipeline for database search")
             elif context_source == "Search Database":
                 context = f"media_id:{search_results.split('(ID: ')[1][:-1]}"
                 logging.info(f"Using search result with context: {context}")
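The remaining branches still derive a media_id filter by parsing the dropdown label text; a quick illustration of that parsing (label value hypothetical, in the "Title (ID: n)" format the split expects):

    search_results = "My Podcast Episode (ID: 42)"
    media_id = search_results.split('(ID: ')[1][:-1]
    print(f"media_id:{media_id}")  # -> media_id:42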
4 changes: 3 additions & 1 deletion App_Function_Libraries/Gradio_UI/Search_Tab.py
@@ -66,6 +66,8 @@ def update_content_for_version(selected_item, item_mapping, selected_version):
         return "", "", ""

 def format_as_html(content, title):
+    if content is None:
+        content = "No content available"
     escaped_content = html.escape(content)
     formatted_content = escaped_content.replace('\n', '<br>')
     return f"""
@@ -79,9 +81,9 @@ def format_as_html(content, title):

 def create_search_tab():
     with gr.TabItem("Search / Detailed View"):
+        gr.Markdown("# Search across all ingested items in the Database")
         with gr.Row():
             with gr.Column(scale=1):
-                gr.Markdown("# Search across all ingested items in the Database")
                 gr.Markdown("by Title / URL / Keyword / or Content via SQLite Full-Text-Search")
                 search_query_input = gr.Textbox(label="Search Query", placeholder="Enter your search query here...")
                 search_type_input = gr.Radio(choices=["Title", "URL", "Keyword", "Content"], value="Title",
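html.escape raises a TypeError when handed None, so the new guard lets items without stored content still render. A quick check (sketch, assuming format_as_html is importable from Search_Tab):

    # Before the guard, format_as_html(None, "Untitled") crashed inside html.escape.
    html_block = format_as_html(None, "Untitled")
    assert "No content available" in html_block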