OWASP
diff --git a/‎backend/apps/ai/Makefile‎
Lines changed: 8 additions & 0 deletions b/‎backend/apps/ai/Makefile‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎backend/apps/ai/common/base/chunk_command.py‎
Lines changed: 9 additions & 4 deletions b/‎backend/apps/ai/common/base/chunk_command.py‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎backend/apps/ai/common/constants.py‎
Lines changed: 1 addition & 0 deletions b/‎backend/apps/ai/common/constants.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backend/apps/ai/common/extractors/repository.py‎
Lines changed: 176 additions & 0 deletions b/‎backend/apps/ai/common/extractors/repository.py‎
Lines changed: 176 additions & 0 deletions
diff --git a/‎backend/apps/ai/management/commands/ai_update_repository_chunks.py‎
Lines changed: 41 additions & 0 deletions b/‎backend/apps/ai/management/commands/ai_update_repository_chunks.py‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎backend/apps/ai/management/commands/ai_update_repository_context.py‎
Lines changed: 41 additions & 0 deletions b/‎backend/apps/ai/management/commands/ai_update_repository_context.py‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎backend/apps/ai/models/chunk.py‎
Lines changed: 2 additions & 2 deletions b/‎backend/apps/ai/models/chunk.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backend/apps/common/utils.py‎
Lines changed: 18 additions & 0 deletions b/‎backend/apps/common/utils.py‎
Lines changed: 18 additions & 0 deletions
@@ -34,6 +34,14 @@ ai-update-project-context:
 	@echo "Updating project context"
 	@CMD="python manage.py ai_update_project_context" $(MAKE) exec-backend-command
 
+# Run the ai_update_repository_chunks management command via exec-backend-command.
+ai-update-repository-chunks:
+	@echo "Updating repository chunks"
+	@CMD="python manage.py ai_update_repository_chunks" $(MAKE) exec-backend-command
+
+# Run the ai_update_repository_context management command via exec-backend-command.
+ai-update-repository-context:
+	@echo "Updating repository context"
+	@CMD="python manage.py ai_update_repository_context" $(MAKE) exec-backend-command
+
 ai-update-slack-message-chunks:
 	@echo "Updating Slack message chunks"
 	@CMD="python manage.py ai_update_slack_message_chunks" $(MAKE) exec-backend-command
 
@@ -7,6 +7,7 @@
 from apps.ai.common.utils import create_chunks_and_embeddings
 from apps.ai.models.chunk import Chunk
 from apps.ai.models.context import Context
+from apps.common.utils import is_valid_json
 
 
 class BaseChunkCommand(BaseAICommand):
@@ -43,10 +44,14 @@ def process_chunks_batch(self, entities: list[Model]) -> int:
                     count, _ = context.chunks.all().delete()
                     self.stdout.write(f"Deleted {count} stale chunks for {entity_key}")
 
-                prose_content, metadata_content = self.extract_content(entity)
-                full_content = (
-                    f"{metadata_content}\n\n{prose_content}" if metadata_content else prose_content
-                )
+                content, metadata_content = self.extract_content(entity)
+
+                if is_valid_json(content):
+                    full_content = content
+                else:
+                    full_content = (
+                        f"{metadata_content}\n\n{content}" if metadata_content else content
+                    )
 
                 if not full_content.strip():
                     self.stdout.write(f"No content to chunk for {self.entity_name} {entity_key}")
 
@@ -4,4 +4,5 @@
 DEFAULT_CHUNKS_RETRIEVAL_LIMIT = 5
 DEFAULT_SIMILARITY_THRESHOLD = 0.4
 DELIMITER = "\n\n"
+# Delay passed to time.sleep() after each repository markdown file fetch.
+GITHUB_REQUEST_INTERVAL_SECONDS = 0.5
 MIN_REQUEST_INTERVAL_SECONDS = 1.2
@@ -0,0 +1,176 @@
+"""Content extractor for Repository."""
+
+import json
+import logging
+import time
+
+from apps.ai.common.constants import DELIMITER, GITHUB_REQUEST_INTERVAL_SECONDS
+from apps.common.utils import is_valid_json
+from apps.github.utils import get_repository_file_content
+
+logger = logging.getLogger(__name__)
+
+
+def extract_repository_content(repository) -> tuple[str, str]:
+    """Extract structured content from repository data.
+
+    Collects the repository's truthy fields (basic details, status and funding
+    flags, enabled features, statistics, dates and ownership) into a JSON
+    document and adds the text of well-known markdown files fetched from GitHub.
+    Falsy fields and empty sections are left out of the document.
+
+    Args:
+        repository: Repository instance
+
+    Returns:
+        tuple[str, str]: (json_content, metadata_content)
+
+    """
+    repository_data = {}
+
+    # Insertion order below defines the key order of the emitted JSON.
+    for field in ("name", "key", "description", "homepage", "license", "topics"):
+        value = getattr(repository, field)
+        if value:
+            repository_data[field] = value
+
+    status = {
+        label: True
+        for label, enabled in (
+            ("archived", repository.is_archived),
+            ("empty", repository.is_empty),
+            ("owasp_repository", repository.is_owasp_repository),
+            ("owasp_site_repository", repository.is_owasp_site_repository),
+        )
+        if enabled
+    }
+    if status:
+        repository_data["status"] = status
+
+    funding = {
+        label: True
+        for label, enabled in (
+            ("policy_compliant", repository.is_funding_policy_compliant),
+            ("has_funding_yml", repository.has_funding_yml),
+        )
+        if enabled
+    }
+    if funding:
+        repository_data["funding"] = funding
+
+    if repository.pages_status:
+        repository_data["pages_status"] = repository.pages_status
+
+    features = [
+        label
+        for label, enabled in (
+            ("downloads", repository.has_downloads),
+            ("issues", repository.has_issues),
+            ("pages", repository.has_pages),
+            ("projects", repository.has_projects),
+            ("wiki", repository.has_wiki),
+        )
+        if enabled
+    ]
+    if features:
+        repository_data["features"] = features
+
+    stats = {
+        label: count
+        for label, count in (
+            ("commits", repository.commits_count),
+            ("contributors", repository.contributors_count),
+            ("forks", repository.forks_count),
+            ("open_issues", repository.open_issues_count),
+            ("stars", repository.stars_count),
+            ("subscribers", repository.subscribers_count),
+            ("watchers", repository.watchers_count),
+        )
+        if count
+    }
+    if stats:
+        repository_data["statistics"] = stats
+
+    dates = {
+        label: moment.strftime("%Y-%m-%d")
+        for label, moment in (
+            ("created", repository.created_at),
+            ("last_updated", repository.updated_at),
+            ("last_pushed", repository.pushed_at),
+        )
+        if moment
+    }
+    if dates:
+        repository_data["dates"] = dates
+
+    ownership = {}
+    if repository.organization:
+        ownership["organization"] = repository.organization.login
+    if repository.owner:
+        ownership["owner"] = repository.owner.login
+    if ownership:
+        repository_data["ownership"] = ownership
+
+    markdown_files = [
+        "README.md",
+        "index.md",
+        "info.md",
+        "leaders.md",
+    ]
+
+    # The organization login takes precedence over the owner login in URLs.
+    if repository.organization:
+        owner = repository.organization.login
+    else:
+        owner = repository.owner.login if repository.owner else ""
+    branch = repository.default_branch or "main"
+
+    markdown_content = {}
+    # Without an owner and a key no URL can be built, so nothing is fetched.
+    if owner and repository.key:
+        tab_files = []
+        contents_url = (
+            f"https://api.github.com/repos/{owner}/{repository.key}/contents/?ref={branch}"
+        )
+        response = get_repository_file_content(contents_url)
+        if response and is_valid_json(response):
+            items = json.loads(response)
+            # Only a JSON array is a directory listing; any other payload
+            # (e.g. an error object) must not be iterated as entries.
+            if isinstance(items, list):
+                for item in items:
+                    if not isinstance(item, dict):
+                        continue
+                    name = item.get("name")
+                    if (
+                        isinstance(name, str)
+                        and name.startswith("tab_")
+                        and name.endswith(".md")
+                    ):
+                        tab_files.append(name)
+
+        for file_path in markdown_files + tab_files:
+            raw_url = (
+                f"https://raw.githubusercontent.com/{owner}/{repository.key}/"
+                f"{branch}/{file_path}"
+            )
+            try:
+                content = get_repository_file_content(raw_url)
+            except (ValueError, TypeError, OSError):
+                # Best effort: a file that cannot be fetched is simply omitted.
+                logger.debug("Failed to fetch markdown file")
+                continue
+
+            if content and content.strip():
+                markdown_content[file_path] = content
+            # Space out consecutive requests to GitHub.
+            time.sleep(GITHUB_REQUEST_INTERVAL_SECONDS)
+
+    if markdown_content:
+        repository_data["markdown_content"] = markdown_content
+
+    json_content = json.dumps(repository_data, indent=2)
+
+    metadata_parts = []
+    if repository.name:
+        metadata_parts.append(f"Repository Name: {repository.name}")
+    if repository.key:
+        metadata_parts.append(f"Repository Key: {repository.key}")
+    if repository.organization:
+        metadata_parts.append(f"Organization: {repository.organization.login}")
+    if repository.owner:
+        metadata_parts.append(f"Owner: {repository.owner.login}")
+
+    return (
+        json_content,
+        DELIMITER.join(filter(None, metadata_parts)),
+    )
@@ -0,0 +1,41 @@
+"""A command to create chunks of OWASP repository data for RAG."""
+
+from django.db.models import QuerySet
+
+from apps.ai.common.base.chunk_command import BaseChunkCommand
+from apps.ai.common.extractors.repository import extract_repository_content
+from apps.github.models.repository import Repository
+
+
+class Command(BaseChunkCommand):
+    """Management command that chunks OWASP site repository content for RAG."""
+
+    key_field_name = "key"
+    model_class = Repository
+
+    def __init__(self, *args, **kwargs):
+        """Set up the command and its irregular plural entity name."""
+        super().__init__(*args, **kwargs)
+        self.entity_name_plural = "repositories"
+
+    def extract_content(self, entity: Repository) -> tuple[str, str]:
+        """Return the content pair extracted from the repository."""
+        return extract_repository_content(entity)
+
+    def get_base_queryset(self) -> QuerySet:
+        """Limit the parent queryset to non-archived, non-empty OWASP site repositories."""
+        repositories = super().get_base_queryset()
+        return repositories.filter(
+            is_owasp_site_repository=True,
+            is_archived=False,
+            is_empty=False,
+        )
+
+    def get_default_queryset(self) -> QuerySet:
+        """Use the base queryset as is: Repository has no is_active field to filter on."""
+        return self.get_base_queryset()
+
+    def source_name(self) -> str:
+        """Name the source used for context creation."""
+        return "owasp_repository"
@@ -0,0 +1,41 @@
+"""A command to update context for OWASP repository data."""
+
+from django.db.models import QuerySet
+
+from apps.ai.common.base.context_command import BaseContextCommand
+from apps.ai.common.extractors.repository import extract_repository_content
+from apps.github.models.repository import Repository
+
+
+class Command(BaseContextCommand):
+    """Management command that updates context for OWASP site repository data."""
+
+    key_field_name = "key"
+    model_class = Repository
+
+    def __init__(self, *args, **kwargs):
+        """Set up the command and its irregular plural entity name."""
+        super().__init__(*args, **kwargs)
+        self.entity_name_plural = "repositories"
+
+    def extract_content(self, entity: Repository) -> tuple[str, str]:
+        """Return the content pair extracted from the repository."""
+        return extract_repository_content(entity)
+
+    def get_base_queryset(self) -> QuerySet:
+        """Limit the parent queryset to non-archived, non-empty OWASP site repositories."""
+        repositories = super().get_base_queryset()
+        return repositories.filter(
+            is_owasp_site_repository=True,
+            is_archived=False,
+            is_empty=False,
+        )
+
+    def get_default_queryset(self) -> QuerySet:
+        """Use the base queryset as is: Repository has no is_active field to filter on."""
+        return self.get_base_queryset()
+
+    def source_name(self) -> str:
+        """Name the source used for context creation."""
+        return "owasp_repository"
@@ -35,8 +35,8 @@ def bulk_save(chunks, fields=None):
     def split_text(text: str) -> list[str]:
         """Split text into chunks."""
         return RecursiveCharacterTextSplitter(
-            chunk_size=300,
-            chunk_overlap=40,
+            chunk_size=500,
+            chunk_overlap=80,
             length_function=len,
             separators=["\n\n", "\n", " ", ""],
         ).split_text(text)
 
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import json
 import re
 from datetime import UTC, datetime
 from urllib.parse import urlparse
@@ -102,6 +103,23 @@ def get_user_ip_address(request) -> str:
     return x_forwarded_for.split(",")[0] if x_forwarded_for else request.META.get("REMOTE_ADDR")
 
 
+def is_valid_json(content: str) -> bool:
+    """Tell whether the given content parses as JSON.
+
+    Args:
+        content: The content to check
+
+    Returns:
+        bool: True when json.loads accepts the content, False when it raises
+            TypeError or ValueError
+
+    """
+    try:
+        json.loads(content)
+    except (TypeError, ValueError):
+        is_json = False
+    else:
+        is_json = True
+    return is_json
+
+
 def join_values(fields: list, delimiter: str = " ") -> str:
     """Join non-empty field values using a specified delimiter.