Skip to content

Commit

Permalink
Merge pull request #52 from rmusser01/main
Browse files Browse the repository at this point in the history
BREAKING CHANGE - PRIOR VERSIONS WILL NOT WORK WITH THIS DATABASE
  • Loading branch information
rmusser01 authored Nov 2, 2024
2 parents 05ed3e0 + 600de4e commit 9effd18
Show file tree
Hide file tree
Showing 72 changed files with 10,034 additions and 4,269 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ jobs:
cd ./Tests/RAG
pytest test_RAG_Library_2.py
- name: Test RAG Notes functions with pytest
run: |
pwd
cd ./Tests/RAG_QA_Chat
pytest test_notes_search.py
- name: Test SQLite lib functions with pytest
run: |
pwd
Expand Down
357 changes: 131 additions & 226 deletions App_Function_Libraries/Audio/Audio_Files.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion App_Function_Libraries/Benchmarks_Evaluations/ms_g_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
wait_random_exponential,
)

from App_Function_Libraries.Chat import chat_api_call
from App_Function_Libraries.Chat.Chat_Functions import chat_api_call

#
#######################################################################################################################
Expand Down
172 changes: 83 additions & 89 deletions App_Function_Libraries/Books/Book_Ingestion_Lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,109 +385,103 @@ def process_markdown_content(markdown_content, file_path, title, author, keyword
return f"Document '{title}' imported successfully. Database result: {result}"


def import_file_handler(files,
                        author,
                        keywords,
                        system_prompt,
                        custom_prompt,
                        auto_summarize,
                        api_name,
                        api_key,
                        max_chunk_size,
                        chunk_overlap,
                        custom_chapter_pattern):
    """
    Import one or more uploaded e-book files into the media database.

    Supports `.epub` files and `.zip` archives (forwarded to `process_zip_file`).
    Each book's title is derived from its filename; `author`, `keywords`, and the
    prompt/summarization settings apply to every file in the batch.

    Args:
        files: A single uploaded file object (anything with a `.name` path
            attribute, e.g. a Gradio file) or a list of them. Falsy input
            short-circuits to "No files uploaded.".
        author: Author recorded for each imported book.
        keywords: Keywords attached to each imported book.
        system_prompt: System prompt forwarded to the importer.
        custom_prompt: Custom prompt forwarded to the importer.
        auto_summarize: Whether to auto-summarize via `api_name`/`api_key`.
        api_name: Summarization API to use (if any).
        api_key: Credential for `api_name`.
        max_chunk_size: Maximum chapter-chunk size; str or int. Blank or
            non-numeric-typed values fall back to 4000.
        chunk_overlap: Overlap between chunks; str or int. Falls back to 0.
        custom_chapter_pattern: Optional regex overriding chapter detection.

    Returns:
        str: Human-readable status, one entry per file joined by blank lines,
        or a single error message.
    """

    def _to_int(value, default):
        # Accept ints directly; parse non-blank strings; otherwise use default.
        # Guards against int("") / int("   ") raising ValueError on blank input.
        if isinstance(value, int):
            return value
        if isinstance(value, str) and value.strip():
            return int(value)
        return default

    file = None  # current file; kept so error labels work if the loop never ran
    try:
        if not files:
            return "No files uploaded."

        # Gradio hands over a single object for one upload, a list for many —
        # normalize so the rest of the function only deals with a list.
        if not isinstance(files, list):
            files = [files]

        # Chunking options are identical for every file — compute them once,
        # not per iteration.
        chunk_options = {
            'method': 'chapter',
            'max_size': _to_int(max_chunk_size, 4000),
            'overlap': _to_int(chunk_overlap, 0),
            'custom_chapter_pattern': custom_chapter_pattern if custom_chapter_pattern else None
        }

        results = []
        for file in files:
            log_counter("file_import_attempt", labels={"file_name": file.name})

            file_path = file.name
            if not os.path.exists(file_path):
                results.append(f"❌ File not found: {file.name}")
                continue

            start_time = datetime.now()

            # Use the filename (without extension) as the book title.
            title = os.path.splitext(os.path.basename(file_path))[0]

            if file_path.lower().endswith('.epub'):
                status = import_epub(
                    file_path,
                    title=title,  # filename-derived title
                    author=author,
                    keywords=keywords,
                    custom_prompt=custom_prompt,
                    system_prompt=system_prompt,
                    summary=None,
                    auto_summarize=auto_summarize,
                    api_name=api_name,
                    api_key=api_key,
                    chunk_options=chunk_options,
                    custom_chapter_pattern=custom_chapter_pattern
                )
                log_counter("epub_import_success", labels={"file_name": file.name})
                results.append(f"📚 {file.name}: {status}")

            elif file_path.lower().endswith('.zip'):
                status = process_zip_file(
                    zip_file=file,
                    title=None,  # let each archived book title itself
                    author=author,
                    keywords=keywords,
                    custom_prompt=custom_prompt,
                    system_prompt=system_prompt,
                    summary=None,
                    auto_summarize=auto_summarize,
                    api_name=api_name,
                    api_key=api_key,
                    chunk_options=chunk_options
                )
                log_counter("zip_import_success", labels={"file_name": file.name})
                results.append(f"📦 {file.name}: {status}")
            else:
                results.append(f"❌ Unsupported file type: {file.name}")
                continue

            processing_time = (datetime.now() - start_time).total_seconds()
            log_histogram("file_import_duration", processing_time, labels={"file_name": file.name})

        return "\n\n".join(results)

    except ValueError as ve:
        logging.exception(f"Error parsing input values: {str(ve)}")
        # `file` may be None if the failure happened before the loop started.
        log_counter("file_import_error",
                    labels={"error": "Invalid input", "file_name": file.name if file else "unknown"})
        return f"❌ Error: Invalid input for chunk size or overlap. Please enter valid numbers."
    except Exception as e:
        logging.exception(f"Error during file import: {str(e)}")
        log_counter("file_import_error",
                    labels={"error": str(e), "file_name": file.name if file else "unknown"})
        return f"❌ Error during import: {str(e)}"



def read_epub(file_path):
"""
Reads and extracts text from an EPUB file.
Expand Down Expand Up @@ -568,9 +562,9 @@ def ingest_text_file(file_path, title=None, author=None, keywords=None):

# Add the text file to the database
add_media_with_keywords(
url=file_path,
url="its_a_book",
title=title,
media_type='document',
media_type='book',
content=content,
keywords=keywords,
prompt='No prompt for text files',
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# Chat.py
# Chat_Functions.py
# Chat functions for interacting with the LLMs as chatbots
import base64
# Imports
import json
import logging
import os
import re
import sqlite3
import tempfile
import time
from datetime import datetime
Expand All @@ -14,7 +15,8 @@
# External Imports
#
# Local Imports
from App_Function_Libraries.DB.DB_Manager import get_conversation_name, save_chat_history_to_database
from App_Function_Libraries.DB.DB_Manager import start_new_conversation, delete_messages_in_conversation, save_message
from App_Function_Libraries.DB.RAG_QA_Chat_DB import get_db_connection, get_conversation_name
from App_Function_Libraries.LLM_API_Calls import chat_with_openai, chat_with_anthropic, chat_with_cohere, \
chat_with_groq, chat_with_openrouter, chat_with_deepseek, chat_with_mistral, chat_with_huggingface
from App_Function_Libraries.LLM_API_Calls_Local import chat_with_aphrodite, chat_with_local_llm, chat_with_ollama, \
Expand All @@ -27,6 +29,16 @@
#
# Functions:

def approximate_token_count(history):
    """
    Roughly estimate the token count of a chat history.

    Uses whitespace-separated word count as a cheap proxy for a real
    tokenizer's output.

    Args:
        history: Iterable of (user_msg, bot_msg) pairs; either side may be
            None or empty and is then ignored.

    Returns:
        int: Total number of whitespace-delimited words across all messages.
    """
    words = []
    for user_msg, bot_msg in history:
        for message in (user_msg, bot_msg):
            if message:
                words.extend(message.split())
    return len(words)

def chat_api_call(api_endpoint, api_key, input_data, prompt, temp, system_message=None):
log_counter("chat_api_call_attempt", labels={"api_endpoint": api_endpoint})
start_time = time.time()
Expand Down Expand Up @@ -173,56 +185,58 @@ def save_chat_history_to_db_wrapper(chatbot, conversation_id, media_content, med
log_counter("save_chat_history_to_db_attempt")
start_time = time.time()
logging.info(f"Attempting to save chat history. Media content type: {type(media_content)}")

try:
# Extract the media_id and media_name from the media_content
media_id = None
if isinstance(media_content, dict):
# First check if we can access the database
try:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT 1")
except sqlite3.DatabaseError as db_error:
logging.error(f"Database is corrupted or inaccessible: {str(db_error)}")
return conversation_id, "Database error: The database file appears to be corrupted. Please contact support."

# Now attempt the save
if not conversation_id:
# Only for new conversations, not updates
media_id = None
logging.debug(f"Media content keys: {media_content.keys()}")
if 'content' in media_content:
if isinstance(media_content, dict) and 'content' in media_content:
try:
content = media_content['content']
if isinstance(content, str):
content_json = json.loads(content)
elif isinstance(content, dict):
content_json = content
else:
raise ValueError(f"Unexpected content type: {type(content)}")

# Use the webpage_url as the media_id
content_json = content if isinstance(content, dict) else json.loads(content)
media_id = content_json.get('webpage_url')
# Use the title as the media_name
media_name = content_json.get('title')

logging.info(f"Extracted media_id: {media_id}, media_name: {media_name}")
except json.JSONDecodeError:
logging.error("Failed to decode JSON from media_content['content']")
except Exception as e:
logging.error(f"Error processing media_content: {str(e)}")
media_name = media_name or content_json.get('title', 'Unnamed Media')
except (json.JSONDecodeError, AttributeError) as e:
logging.error(f"Error processing media content: {str(e)}")
media_id = "unknown_media"
media_name = media_name or "Unnamed Media"
else:
logging.warning("'content' key not found in media_content")
else:
logging.warning(f"media_content is not a dictionary. Type: {type(media_content)}")

if media_id is None:
# If we couldn't find a media_id, we'll use a placeholder
media_id = "unknown_media"
logging.warning(f"Unable to extract media_id from media_content. Using placeholder: {media_id}")

if media_name is None:
media_name = "Unnamed Media"
logging.warning(f"Unable to extract media_name from media_content. Using placeholder: {media_name}")
media_id = "unknown_media"
media_name = media_name or "Unnamed Media"

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
conversation_title = f"{media_name}_{timestamp}"
conversation_id = start_new_conversation(title=conversation_title, media_id=media_id)
logging.info(f"Created new conversation with ID: {conversation_id}")

# For both new and existing conversations
try:
delete_messages_in_conversation(conversation_id)
for user_msg, assistant_msg in chatbot:
if user_msg:
save_message(conversation_id, "user", user_msg)
if assistant_msg:
save_message(conversation_id, "assistant", assistant_msg)
except sqlite3.DatabaseError as db_error:
logging.error(f"Database error during message save: {str(db_error)}")
return conversation_id, "Database error: Unable to save messages. Please try again or contact support."

# Generate a unique conversation name using media_id and current timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
conversation_name = f"{media_name}_{timestamp}"

new_conversation_id = save_chat_history_to_database(chatbot, conversation_id, media_id, media_name,
conversation_name)
save_duration = time.time() - start_time
log_histogram("save_chat_history_to_db_duration", save_duration)
log_counter("save_chat_history_to_db_success")
return new_conversation_id, f"Chat history saved successfully as {conversation_name}!"

return conversation_id, "Chat history saved successfully!"

except Exception as e:
log_counter("save_chat_history_to_db_error", labels={"error": str(e)})
error_message = f"Failed to save chat history: {str(e)}"
Expand Down
Empty file.
Loading

0 comments on commit 9effd18

Please sign in to comment.