Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,23 @@ jobs:
steps:
- uses: actions/checkout@v4

- name: Free Up Disk Space
run: |
echo "Freeing up disk space before Docker builds..."
df -h

# Remove unnecessary packages and files (~14GB)
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf "$AGENT_TOOLSDIRECTORY"

# Clean Docker build cache
docker system prune -af --volumes || true

df -h
echo "✅ Disk space freed"

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2

Expand Down
25 changes: 21 additions & 4 deletions .github/workflows/dev-environment-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ on:
branches: [ main, develop ]
paths:
- '.devcontainer/**'
- 'Makefile'
- 'docker-compose*.yml'
- 'backend/**'
- 'tests/**'
- 'docker-compose.dev.yml'
- 'docker-compose.hotreload.yml'
# Removed 'backend/**' and 'tests/**' to prevent duplicate builds
# This workflow tests dev container setup, not feature changes
workflow_dispatch:

jobs:
Expand Down Expand Up @@ -131,6 +131,23 @@ jobs:

echo "✅ All volume directories created"

- name: Free Up Disk Space
run: |
echo "Freeing up disk space before building images..."
df -h

# Remove unnecessary packages and files
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf "$AGENT_TOOLSDIRECTORY"

# Clean Docker build cache
docker system prune -af --volumes || true

df -h
echo "✅ Disk space freed"

- name: Build Development Images
run: |
echo "Building development Docker images..."
Expand Down
8 changes: 8 additions & 0 deletions backend/Dockerfile.backend
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ WORKDIR /app
# Copy dependency files first for better layer caching
COPY pyproject.toml poetry.lock ./

# Install CPU-only PyTorch first to avoid CUDA dependencies (~6GB savings)
# This must be done before Poetry installs docling (which depends on torch)
# Using torch 2.5.0 to match torchvision 0.20.0 compatibility
RUN pip install --no-cache-dir \
torch==2.5.0+cpu \
torchvision==0.20.0+cpu \
--index-url https://download.pytorch.org/whl/cpu

# Install python dependencies directly to system Python (no virtual environment)
# Note: Removed --no-update flag as it's deprecated in Poetry 2.x
RUN poetry install --only main --no-root --no-cache && \
Expand Down
4 changes: 4 additions & 0 deletions backend/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ class Settings(BaseSettings):
str, Field(default="child_with_parent", alias="HIERARCHICAL_RETRIEVAL_MODE")
] # Options: child_only, child_with_parent, full_hierarchy

# IBM Docling Feature Flags
enable_docling: Annotated[bool, Field(default=False, alias="ENABLE_DOCLING")]
docling_fallback_enabled: Annotated[bool, Field(default=True, alias="DOCLING_FALLBACK_ENABLED")]

# Chain of Thought (CoT) settings
cot_max_reasoning_depth: Annotated[int, Field(default=3, alias="COT_MAX_REASONING_DEPTH")]
cot_reasoning_strategy: Annotated[str, Field(default="decomposition", alias="COT_REASONING_STRATEGY")]
Expand Down
133 changes: 133 additions & 0 deletions backend/dev_tests/manual/test_docling_debug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""Debug script to see what Docling extracts from a PDF.

Usage:
poetry run python dev_tests/manual/test_docling_debug.py
"""

import sys
from collections import Counter

from docling.document_converter import DocumentConverter  # type: ignore[import-not-found]


def _summarize_item_types(items):
    """Print a count of each item type returned by doc.iterate_items()."""
    print("\n📝 Item Types:")
    # Counter replaces the manual dict.get(..., 0) + 1 counting loop.
    counts = Counter(type(item).__name__ for item in items)
    for item_type, count in counts.items():
        print(f" - {item_type}: {count}")


def _inspect_item(index, item_data):
    """Print diagnostics for one iterate_items() entry.

    Docling may yield either a bare item or a (item, level) tuple, so both
    shapes are handled before probing for text and page provenance.
    """
    print(f"\n --- Item {index + 1} ---")

    # Extract actual item from tuple
    if isinstance(item_data, tuple):
        item = item_data[0]
        level = item_data[1] if len(item_data) > 1 else None
        print(f" Tuple: (item, level={level})")
    else:
        item = item_data
        print(" Direct item")

    print(f" Type: {type(item).__name__}")

    # Check for text
    if hasattr(item, "text"):
        text = str(item.text)[:80]
        print(f" Text: {text}...")

    # Check for provenance (page info)
    if hasattr(item, "prov"):
        prov = item.prov
        print(" Has prov: True")
        print(f" Prov type: {type(prov)}")
        print(f" Prov value: {prov}")

        # Provenance is usually a list; inspect its first element for page info.
        if isinstance(prov, list) and len(prov) > 0:
            print(f" Prov[0] type: {type(prov[0])}")
            print(f" Prov[0] value: {prov[0]}")
            if hasattr(prov[0], "page"):
                print(f" Prov[0].page: {prov[0].page}")
            if hasattr(prov[0], "__dict__"):
                print(f" Prov[0] attrs: {prov[0].__dict__}")
    else:
        print(" Has prov: False")

    # Check for page attributes directly on the item
    if hasattr(item, "page_no"):
        print(f" item.page_no: {item.page_no}")
    if hasattr(item, "page"):
        print(f" item.page: {item.page}")
    else:
        # No page attr: dump a few attribute names to aid exploration.
        print(f" Attributes: {dir(item)[:10]}...")  # Show first 10 attrs
    # NOTE: the original script re-printed text/prov a second time here with
    # slightly different truncation — removed as accidental duplication.


def _dump_exports(doc):
    """Try the document's markdown/text export methods and preview results."""
    print("\n📄 Export Options:")
    if hasattr(doc, "export_to_markdown"):
        print(" - Has export_to_markdown")
        try:
            md = doc.export_to_markdown()
            print(f" - Markdown length: {len(md)} chars")
            print(f" - Markdown preview:\n{md[:500]}")
        except Exception as e:
            print(f" - Export failed: {e}")

    if hasattr(doc, "export_to_text"):
        print(" - Has export_to_text")
        try:
            text = doc.export_to_text()
            print(f" - Text length: {len(text)} chars")
            print(f" - Text preview:\n{text[:500]}")
        except Exception as e:
            print(f" - Export failed: {e}")


def main():
    """Debug Docling extraction for a single PDF.

    The PDF path may be passed as the first command-line argument; when
    omitted, the original developer-local default path is used, so existing
    invocations behave exactly as before.
    """
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "/Users/mg/Downloads/407ETR.pdf"

    print("=" * 80)
    print("DOCLING DEBUG - Raw Extraction")
    print("=" * 80)
    print(f"\n📄 Processing: {pdf_path}\n")

    # Convert with Docling
    converter = DocumentConverter()
    result = converter.convert(pdf_path)

    doc = result.document

    print("✅ Document converted successfully")
    print("\n📋 Document Metadata:")
    print(f" - Has metadata attr: {hasattr(doc, 'metadata')}")
    if hasattr(doc, "metadata"):
        print(f" - Metadata: {doc.metadata}")

    print("\n🔍 Document Structure:")
    print(f" - Has iterate_items: {hasattr(doc, 'iterate_items')}")

    if hasattr(doc, "iterate_items"):
        items = list(doc.iterate_items())
        print(f" - Total items: {len(items)}")

        if items:
            _summarize_item_types(items)
            print("\n🔎 First 5 items (checking page info):")
            for i, item_data in enumerate(items[:5]):
                _inspect_item(i, item_data)
        else:
            print(" ⚠️ No items found!")
            print("\n This could mean:")
            print(" 1. PDF is image-based and needs OCR")
            print(" 2. PDF structure isn't recognized")
            print(" 3. Content is in a different format")

    # Check if we can export to markdown/text
    _dump_exports(doc)

    print("\n" + "=" * 80)

# Run the debug script only when executed directly, not when imported.
if __name__ == "__main__":
    main()
Loading
Loading