Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,23 @@ jobs:
steps:
- uses: actions/checkout@v4

- name: Free Up Disk Space
run: |
echo "Freeing up disk space before Docker builds..."
df -h

# Remove unnecessary packages and files (~14GB)
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf "$AGENT_TOOLSDIRECTORY"

# Clean Docker build cache
docker system prune -af --volumes || true

df -h
echo "✅ Disk space freed"

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2

Expand Down
25 changes: 21 additions & 4 deletions .github/workflows/dev-environment-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ on:
branches: [ main, develop ]
paths:
- '.devcontainer/**'
- 'Makefile'
- 'docker-compose*.yml'
- 'backend/**'
- 'tests/**'
- 'docker-compose.dev.yml'
- 'docker-compose.hotreload.yml'
# Removed 'backend/**' and 'tests/**' to prevent duplicate builds
# This workflow tests dev container setup, not feature changes
workflow_dispatch:

jobs:
Expand Down Expand Up @@ -131,6 +131,23 @@ jobs:

echo "✅ All volume directories created"

- name: Free Up Disk Space
run: |
echo "Freeing up disk space before building images..."
df -h

# Remove unnecessary packages and files
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf "$AGENT_TOOLSDIRECTORY"

# Clean Docker build cache
docker system prune -af --volumes || true

df -h
echo "✅ Disk space freed"

- name: Build Development Images
run: |
echo "Building development Docker images..."
Expand Down
8 changes: 8 additions & 0 deletions backend/Dockerfile.backend
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ WORKDIR /app
# Copy dependency files first for better layer caching
COPY pyproject.toml poetry.lock ./

# Install CPU-only PyTorch first to avoid CUDA dependencies (~6GB savings)
# This must be done before Poetry installs docling (which depends on torch)
# Using torch 2.5.0 to match torchvision 0.20.0 compatibility
RUN pip install --no-cache-dir \
torch==2.5.0+cpu \
torchvision==0.20.0+cpu \
--index-url https://download.pytorch.org/whl/cpu

# Install python dependencies directly to system Python (no virtual environment)
# Note: Removed --no-update flag as it's deprecated in Poetry 2.x
RUN poetry install --only main --no-root --no-cache && \
Expand Down
4 changes: 4 additions & 0 deletions backend/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ class Settings(BaseSettings):
str, Field(default="child_with_parent", alias="HIERARCHICAL_RETRIEVAL_MODE")
] # Options: child_only, child_with_parent, full_hierarchy

# IBM Docling Feature Flags
enable_docling: Annotated[bool, Field(default=False, alias="ENABLE_DOCLING")]
docling_fallback_enabled: Annotated[bool, Field(default=True, alias="DOCLING_FALLBACK_ENABLED")]

# Chain of Thought (CoT) settings
cot_max_reasoning_depth: Annotated[int, Field(default=3, alias="COT_MAX_REASONING_DEPTH")]
cot_reasoning_strategy: Annotated[str, Field(default="decomposition", alias="COT_REASONING_STRATEGY")]
Expand Down
133 changes: 133 additions & 0 deletions backend/dev_tests/manual/test_docling_debug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""Debug script to see what Docling extracts from a PDF.

Usage:
poetry run python dev_tests/manual/test_docling_debug.py
"""

import sys
from collections import Counter

from docling.document_converter import DocumentConverter  # type: ignore[import-not-found]


def _summarize_item_types(items):
    """Print a count of each item type returned by doc.iterate_items()."""
    print("\n📝 Item Types:")
    # Counter replaces the manual dict.get(..., 0) + 1 counting loop.
    counts = Counter(type(item).__name__ for item in items)
    for item_type, count in counts.items():
        print(f" - {item_type}: {count}")


def _inspect_item(index, item_data):
    """Print diagnostics for one iterate_items() entry.

    Docling may yield either a bare item or a (item, level) tuple, so both
    shapes are handled before probing for text and page provenance.
    """
    print(f"\n --- Item {index + 1} ---")

    # Extract actual item from tuple
    if isinstance(item_data, tuple):
        item = item_data[0]
        level = item_data[1] if len(item_data) > 1 else None
        print(f" Tuple: (item, level={level})")
    else:
        item = item_data
        print(" Direct item")

    print(f" Type: {type(item).__name__}")

    # Check for text
    if hasattr(item, "text"):
        text = str(item.text)[:80]
        print(f" Text: {text}...")

    # Check for provenance (page info)
    if hasattr(item, "prov"):
        prov = item.prov
        print(" Has prov: True")
        print(f" Prov type: {type(prov)}")
        print(f" Prov value: {prov}")

        # Provenance is usually a list; inspect its first element for page info.
        if isinstance(prov, list) and len(prov) > 0:
            print(f" Prov[0] type: {type(prov[0])}")
            print(f" Prov[0] value: {prov[0]}")
            if hasattr(prov[0], "page"):
                print(f" Prov[0].page: {prov[0].page}")
            if hasattr(prov[0], "__dict__"):
                print(f" Prov[0] attrs: {prov[0].__dict__}")
    else:
        print(" Has prov: False")

    # Check for page attributes directly on the item
    if hasattr(item, "page_no"):
        print(f" item.page_no: {item.page_no}")
    if hasattr(item, "page"):
        print(f" item.page: {item.page}")
    else:
        # No page attr: dump a few attribute names to aid exploration.
        print(f" Attributes: {dir(item)[:10]}...")  # Show first 10 attrs
    # NOTE: the original script re-printed text/prov a second time here with
    # slightly different truncation — removed as accidental duplication.


def _dump_exports(doc):
    """Try the document's markdown/text export methods and preview results."""
    print("\n📄 Export Options:")
    if hasattr(doc, "export_to_markdown"):
        print(" - Has export_to_markdown")
        try:
            md = doc.export_to_markdown()
            print(f" - Markdown length: {len(md)} chars")
            print(f" - Markdown preview:\n{md[:500]}")
        except Exception as e:
            print(f" - Export failed: {e}")

    if hasattr(doc, "export_to_text"):
        print(" - Has export_to_text")
        try:
            text = doc.export_to_text()
            print(f" - Text length: {len(text)} chars")
            print(f" - Text preview:\n{text[:500]}")
        except Exception as e:
            print(f" - Export failed: {e}")


def main():
    """Debug Docling extraction for a single PDF.

    The PDF path may be passed as the first command-line argument; when
    omitted, the original developer-local default path is used, so existing
    invocations behave exactly as before.
    """
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "/Users/mg/Downloads/407ETR.pdf"

    print("=" * 80)
    print("DOCLING DEBUG - Raw Extraction")
    print("=" * 80)
    print(f"\n📄 Processing: {pdf_path}\n")

    # Convert with Docling
    converter = DocumentConverter()
    result = converter.convert(pdf_path)

    doc = result.document

    print("✅ Document converted successfully")
    print("\n📋 Document Metadata:")
    print(f" - Has metadata attr: {hasattr(doc, 'metadata')}")
    if hasattr(doc, "metadata"):
        print(f" - Metadata: {doc.metadata}")

    print("\n🔍 Document Structure:")
    print(f" - Has iterate_items: {hasattr(doc, 'iterate_items')}")

    if hasattr(doc, "iterate_items"):
        items = list(doc.iterate_items())
        print(f" - Total items: {len(items)}")

        if items:
            _summarize_item_types(items)
            print("\n🔎 First 5 items (checking page info):")
            for i, item_data in enumerate(items[:5]):
                _inspect_item(i, item_data)
        else:
            print(" ⚠️ No items found!")
            print("\n This could mean:")
            print(" 1. PDF is image-based and needs OCR")
            print(" 2. PDF structure isn't recognized")
            print(" 3. Content is in a different format")

    # Check if we can export to markdown/text
    _dump_exports(doc)

    print("\n" + "=" * 80)

# Run the debug script only when executed directly, not when imported.
if __name__ == "__main__":
    main()
Loading
Loading