Skip to content

Commit

Permalink
Merge pull request #52 from rmusser01/main
Browse files Browse the repository at this point in the history
BREAKING CHANGE - PRIOR VERSIONS WILL NOT WORK WITH THIS DATABASE
  • Loading branch information
rmusser01 authored Nov 2, 2024
2 parents 05ed3e0 + 600de4e commit 9effd18
Show file tree
Hide file tree
Showing 72 changed files with 10,034 additions and 4,269 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ jobs:
cd ./Tests/RAG
pytest test_RAG_Library_2.py
- name: Test RAG Notes functions with pytest
run: |
pwd
cd ./Tests/RAG_QA_Chat
pytest test_notes_search.py
- name: Test SQLite lib functions with pytest
run: |
pwd
Expand Down
357 changes: 131 additions & 226 deletions App_Function_Libraries/Audio/Audio_Files.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion App_Function_Libraries/Benchmarks_Evaluations/ms_g_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
wait_random_exponential,
)

from App_Function_Libraries.Chat import chat_api_call
from App_Function_Libraries.Chat.Chat_Functions import chat_api_call

#
#######################################################################################################################
Expand Down
172 changes: 83 additions & 89 deletions App_Function_Libraries/Books/Book_Ingestion_Lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,109 +385,103 @@ def process_markdown_content(markdown_content, file_path, title, author, keyword
return f"Document '{title}' imported successfully. Database result: {result}"


def import_file_handler(files,
                        author,
                        keywords,
                        system_prompt,
                        custom_prompt,
                        auto_summarize,
                        api_name,
                        api_key,
                        max_chunk_size,
                        chunk_overlap,
                        custom_chapter_pattern):
    """
    Import one or more uploaded e-book files into the media database.

    Supports `.epub` files and `.zip` archives (forwarded to `process_zip_file`).
    Each book's title is derived from its filename; `author`, `keywords`, and the
    prompt/summarization settings apply to every file in the batch.

    Args:
        files: A single uploaded file object (anything with a `.name` path
            attribute, e.g. a Gradio file) or a list of them. Falsy input
            short-circuits to "No files uploaded.".
        author: Author recorded for each imported book.
        keywords: Keywords attached to each imported book.
        system_prompt: System prompt forwarded to the importer.
        custom_prompt: Custom prompt forwarded to the importer.
        auto_summarize: Whether to auto-summarize via `api_name`/`api_key`.
        api_name: Summarization API to use (if any).
        api_key: Credential for `api_name`.
        max_chunk_size: Maximum chapter-chunk size; str or int. Blank or
            non-numeric-typed values fall back to 4000.
        chunk_overlap: Overlap between chunks; str or int. Falls back to 0.
        custom_chapter_pattern: Optional regex overriding chapter detection.

    Returns:
        str: Human-readable status, one entry per file joined by blank lines,
        or a single error message.
    """

    def _to_int(value, default):
        # Accept ints directly; parse non-blank strings; otherwise use default.
        # Guards against int("") / int("   ") raising ValueError on blank input.
        if isinstance(value, int):
            return value
        if isinstance(value, str) and value.strip():
            return int(value)
        return default

    file = None  # current file; kept so error labels work if the loop never ran
    try:
        if not files:
            return "No files uploaded."

        # Gradio hands over a single object for one upload, a list for many —
        # normalize so the rest of the function only deals with a list.
        if not isinstance(files, list):
            files = [files]

        # Chunking options are identical for every file — compute them once,
        # not per iteration.
        chunk_options = {
            'method': 'chapter',
            'max_size': _to_int(max_chunk_size, 4000),
            'overlap': _to_int(chunk_overlap, 0),
            'custom_chapter_pattern': custom_chapter_pattern if custom_chapter_pattern else None
        }

        results = []
        for file in files:
            log_counter("file_import_attempt", labels={"file_name": file.name})

            file_path = file.name
            if not os.path.exists(file_path):
                results.append(f"❌ File not found: {file.name}")
                continue

            start_time = datetime.now()

            # Use the filename (without extension) as the book title.
            title = os.path.splitext(os.path.basename(file_path))[0]

            if file_path.lower().endswith('.epub'):
                status = import_epub(
                    file_path,
                    title=title,  # filename-derived title
                    author=author,
                    keywords=keywords,
                    custom_prompt=custom_prompt,
                    system_prompt=system_prompt,
                    summary=None,
                    auto_summarize=auto_summarize,
                    api_name=api_name,
                    api_key=api_key,
                    chunk_options=chunk_options,
                    custom_chapter_pattern=custom_chapter_pattern
                )
                log_counter("epub_import_success", labels={"file_name": file.name})
                results.append(f"📚 {file.name}: {status}")

            elif file_path.lower().endswith('.zip'):
                status = process_zip_file(
                    zip_file=file,
                    title=None,  # let each archived book title itself
                    author=author,
                    keywords=keywords,
                    custom_prompt=custom_prompt,
                    system_prompt=system_prompt,
                    summary=None,
                    auto_summarize=auto_summarize,
                    api_name=api_name,
                    api_key=api_key,
                    chunk_options=chunk_options
                )
                log_counter("zip_import_success", labels={"file_name": file.name})
                results.append(f"📦 {file.name}: {status}")
            else:
                results.append(f"❌ Unsupported file type: {file.name}")
                continue

            processing_time = (datetime.now() - start_time).total_seconds()
            log_histogram("file_import_duration", processing_time, labels={"file_name": file.name})

        return "\n\n".join(results)

    except ValueError as ve:
        logging.exception(f"Error parsing input values: {str(ve)}")
        # `file` may be None if the failure happened before the loop started.
        log_counter("file_import_error",
                    labels={"error": "Invalid input", "file_name": file.name if file else "unknown"})
        return f"❌ Error: Invalid input for chunk size or overlap. Please enter valid numbers."
    except Exception as e:
        logging.exception(f"Error during file import: {str(e)}")
        log_counter("file_import_error",
                    labels={"error": str(e), "file_name": file.name if file else "unknown"})
        return f"❌ Error during import: {str(e)}"



def read_epub(file_path):
"""
Reads and extracts text from an EPUB file.
Expand Down Expand Up @@ -568,9 +562,9 @@ def ingest_text_file(file_path, title=None, author=None, keywords=None):

# Add the text file to the database
add_media_with_keywords(
url=file_path,
url="its_a_book",
title=title,
media_type='document',
media_type='book',
content=content,
keywords=keywords,
prompt='No prompt for text files',
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# Chat.py
# Chat_Functions.py
# Chat functions for interacting with the LLMs as chatbots
import base64
# Imports
import json
import logging
import os
import re
import sqlite3
import tempfile
import time
from datetime import datetime
Expand All @@ -14,7 +15,8 @@
# External Imports
#
# Local Imports
from App_Function_Libraries.DB.DB_Manager import get_conversation_name, save_chat_history_to_database
from App_Function_Libraries.DB.DB_Manager import start_new_conversation, delete_messages_in_conversation, save_message
from App_Function_Libraries.DB.RAG_QA_Chat_DB import get_db_connection, get_conversation_name
from App_Function_Libraries.LLM_API_Calls import chat_with_openai, chat_with_anthropic, chat_with_cohere, \
chat_with_groq, chat_with_openrouter, chat_with_deepseek, chat_with_mistral, chat_with_huggingface
from App_Function_Libraries.LLM_API_Calls_Local import chat_with_aphrodite, chat_with_local_llm, chat_with_ollama, \
Expand All @@ -27,6 +29,16 @@
#
# Functions:

def approximate_token_count(history):
    """
    Roughly estimate the token count of a chat history.

    Uses whitespace-separated word count as a cheap proxy for a real
    tokenizer's output.

    Args:
        history: Iterable of (user_msg, bot_msg) pairs; either side may be
            None or empty and is then ignored.

    Returns:
        int: Total number of whitespace-delimited words across all messages.
    """
    words = []
    for user_msg, bot_msg in history:
        for message in (user_msg, bot_msg):
            if message:
                words.extend(message.split())
    return len(words)

def chat_api_call(api_endpoint, api_key, input_data, prompt, temp, system_message=None):
log_counter("chat_api_call_attempt", labels={"api_endpoint": api_endpoint})
start_time = time.time()
Expand Down Expand Up @@ -173,56 +185,58 @@ def save_chat_history_to_db_wrapper(chatbot, conversation_id, media_content, med
log_counter("save_chat_history_to_db_attempt")
start_time = time.time()
logging.info(f"Attempting to save chat history. Media content type: {type(media_content)}")

try:
# Extract the media_id and media_name from the media_content
media_id = None
if isinstance(media_content, dict):
# First check if we can access the database
try:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT 1")
except sqlite3.DatabaseError as db_error:
logging.error(f"Database is corrupted or inaccessible: {str(db_error)}")
return conversation_id, "Database error: The database file appears to be corrupted. Please contact support."

# Now attempt the save
if not conversation_id:
# Only for new conversations, not updates
media_id = None
logging.debug(f"Media content keys: {media_content.keys()}")
if 'content' in media_content:
if isinstance(media_content, dict) and 'content' in media_content:
try:
content = media_content['content']
if isinstance(content, str):
content_json = json.loads(content)
elif isinstance(content, dict):
content_json = content
else:
raise ValueError(f"Unexpected content type: {type(content)}")

# Use the webpage_url as the media_id
content_json = content if isinstance(content, dict) else json.loads(content)
media_id = content_json.get('webpage_url')
# Use the title as the media_name
media_name = content_json.get('title')

logging.info(f"Extracted media_id: {media_id}, media_name: {media_name}")
except json.JSONDecodeError:
logging.error("Failed to decode JSON from media_content['content']")
except Exception as e:
logging.error(f"Error processing media_content: {str(e)}")
media_name = media_name or content_json.get('title', 'Unnamed Media')
except (json.JSONDecodeError, AttributeError) as e:
logging.error(f"Error processing media content: {str(e)}")
media_id = "unknown_media"
media_name = media_name or "Unnamed Media"
else:
logging.warning("'content' key not found in media_content")
else:
logging.warning(f"media_content is not a dictionary. Type: {type(media_content)}")

if media_id is None:
# If we couldn't find a media_id, we'll use a placeholder
media_id = "unknown_media"
logging.warning(f"Unable to extract media_id from media_content. Using placeholder: {media_id}")

if media_name is None:
media_name = "Unnamed Media"
logging.warning(f"Unable to extract media_name from media_content. Using placeholder: {media_name}")
media_id = "unknown_media"
media_name = media_name or "Unnamed Media"

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
conversation_title = f"{media_name}_{timestamp}"
conversation_id = start_new_conversation(title=conversation_title, media_id=media_id)
logging.info(f"Created new conversation with ID: {conversation_id}")

# For both new and existing conversations
try:
delete_messages_in_conversation(conversation_id)
for user_msg, assistant_msg in chatbot:
if user_msg:
save_message(conversation_id, "user", user_msg)
if assistant_msg:
save_message(conversation_id, "assistant", assistant_msg)
except sqlite3.DatabaseError as db_error:
logging.error(f"Database error during message save: {str(db_error)}")
return conversation_id, "Database error: Unable to save messages. Please try again or contact support."

# Generate a unique conversation name using media_id and current timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
conversation_name = f"{media_name}_{timestamp}"

new_conversation_id = save_chat_history_to_database(chatbot, conversation_id, media_id, media_name,
conversation_name)
save_duration = time.time() - start_time
log_histogram("save_chat_history_to_db_duration", save_duration)
log_counter("save_chat_history_to_db_success")
return new_conversation_id, f"Chat history saved successfully as {conversation_name}!"

return conversation_id, "Chat history saved successfully!"

except Exception as e:
log_counter("save_chat_history_to_db_error", labels={"error": str(e)})
error_message = f"Failed to save chat history: {str(e)}"
Expand Down
Empty file.
Loading

0 comments on commit 9effd18

Please sign in to comment.