4 changes: 4 additions & 0 deletions backend/apps/ai/Makefile
@@ -2,6 +2,10 @@ ai-create-chapter-chunks:
@echo "Creating chapter chunks"
@CMD="python manage.py ai_create_chapter_chunks" $(MAKE) exec-backend-command

ai-create-event-chunks:
@echo "Creating event chunks"
@CMD="python manage.py ai_create_event_chunks" $(MAKE) exec-backend-command

ai-create-slack-message-chunks:
@echo "Creating Slack message chunks"
@CMD="python manage.py ai_create_slack_message_chunks" $(MAKE) exec-backend-command
64 changes: 64 additions & 0 deletions backend/apps/ai/common/utils.py
@@ -0,0 +1,64 @@
"""AI utils."""

import logging
import time
from datetime import UTC, datetime, timedelta

from apps.ai.common.constants import (
DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
MIN_REQUEST_INTERVAL_SECONDS,
)
from apps.ai.models.chunk import Chunk

logger: logging.Logger = logging.getLogger(__name__)


def create_chunks_and_embeddings(
all_chunk_texts: list[str],
content_object,
openai_client,
) -> list[Chunk]:
"""Create chunks and embeddings from given texts using OpenAI embeddings.
Args:
all_chunk_texts (list[str]): List of text chunks to embed.
content_object: The object to associate the chunks with.
openai_client: Initialized OpenAI client instance.
Returns:
list[Chunk]: List of Chunk instances (not saved).
"""
try:
last_request_time = datetime.now(UTC) - timedelta(
seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS
)
time_since_last_request = datetime.now(UTC) - last_request_time

if time_since_last_request < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
time.sleep(MIN_REQUEST_INTERVAL_SECONDS - time_since_last_request.total_seconds())

response = openai_client.embeddings.create(
input=all_chunk_texts,
model="text-embedding-3-small",
)

return [
chunk
for text, embedding in zip(
all_chunk_texts,
[d.embedding for d in response.data],
strict=True,
)
if (
chunk := Chunk.update_data(
text=text,
content_object=content_object,
embedding=embedding,
save=False,
)
)
]
except Exception:
logger.exception("OpenAI API error")
return []
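For reference, a minimal usage sketch of the new helper, assuming the same client setup the management commands below use (the Event instance and chunk texts here are purely illustrative):

import os

import openai

from apps.ai.common.utils import create_chunks_and_embeddings
from apps.ai.models.chunk import Chunk
from apps.owasp.models.event import Event

openai_client = openai.OpenAI(api_key=os.getenv("DJANGO_OPEN_AI_SECRET_KEY"))

# Illustrative content object; the helper accepts any object Chunk can be associated with.
event = Event.objects.first()

# The helper returns unsaved Chunk instances (it calls Chunk.update_data with save=False),
# so callers persist them in bulk, as the management commands do.
chunks = create_chunks_and_embeddings(
    all_chunk_texts=["Example chunk text one.", "Example chunk text two."],
    content_object=event,
    openai_client=openai_client,
)
Chunk.bulk_save(chunks)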
70 changes: 10 additions & 60 deletions backend/apps/ai/management/commands/ai_create_chapter_chunks.py
@@ -1,17 +1,12 @@
"""A command to create chunks of OWASP chapter data for RAG."""

import os
import time
from datetime import UTC, datetime, timedelta

import openai
from django.core.management.base import BaseCommand

from apps.ai.common.constants import (
DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
DELIMITER,
MIN_REQUEST_INTERVAL_SECONDS,
)
from apps.ai.common.constants import DELIMITER
from apps.ai.common.utils import create_chunks_and_embeddings
from apps.ai.models.chunk import Chunk
from apps.owasp.models.chapter import Chapter

@@ -65,15 +60,15 @@ def handle(self, *args, **options):

batch_chunks = []
for chapter in batch_chapters:
batch_chunks.extend(self.create_chunks(chapter))
batch_chunks.extend(self.handle_chunks(chapter))

if batch_chunks:
Chunk.bulk_save(batch_chunks)
self.stdout.write(f"Saved {len(batch_chunks)} chunks")

self.stdout.write(f"Completed processing all {total_chapters} chapters")

def create_chunks(self, chapter: Chapter) -> list[Chunk]:
def handle_chunks(self, chapter: Chapter) -> list[Chunk]:
"""Create chunks from a chapter's data."""
prose_content, metadata_content = self.extract_chapter_content(chapter)

@@ -89,41 +84,11 @@ def create_chunks(self, chapter: Chapter) -> list[Chunk]:
self.stdout.write(f"No content to chunk for chapter {chapter.key}")
return []

try:
time_since_last_request = datetime.now(UTC) - getattr(
self,
"last_request_time",
datetime.now(UTC) - timedelta(seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS),
)

if time_since_last_request < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
time.sleep(MIN_REQUEST_INTERVAL_SECONDS - time_since_last_request.total_seconds())

response = self.openai_client.embeddings.create(
input=all_chunk_texts,
model="text-embedding-3-small",
)
self.last_request_time = datetime.now(UTC)

return [
chunk
for text, embedding in zip(
all_chunk_texts,
[d.embedding for d in response.data],
strict=True,
)
if (
chunk := Chunk.update_data(
text=text,
content_object=chapter,
embedding=embedding,
save=False,
)
)
]
except openai.OpenAIError as e:
self.stdout.write(self.style.ERROR(f"OpenAI API error for chapter {chapter.key}: {e}"))
return []
return create_chunks_and_embeddings(
all_chunk_texts=all_chunk_texts,
content_object=chapter,
openai_client=self.openai_client,
)

def extract_chapter_content(self, chapter: Chapter) -> tuple[str, str]:
"""Extract and separate prose content from metadata for a chapter.
@@ -164,9 +129,6 @@ def extract_chapter_content(self, chapter: Chapter) -> tuple[str, str]:
if location_parts:
metadata_parts.append(f"Location Information: {', '.join(location_parts)}")

if chapter.level:
metadata_parts.append(f"Chapter Level: {chapter.level}")

if chapter.currency:
metadata_parts.append(f"Currency: {chapter.currency}")

@@ -180,19 +142,7 @@ def extract_chapter_content(self, chapter: Chapter) -> tuple[str, str]:
metadata_parts.append(f"Topics: {', '.join(chapter.topics)}")

if chapter.leaders_raw:
leaders_info = []
for leader in chapter.leaders_raw:
if isinstance(leader, dict):
leader_name = leader.get("name", "")
leader_email = leader.get("email", "")
if leader_name:
leader_text = f"Leader: {leader_name}"
if leader_email:
leader_text += f" ({leader_email})"
leaders_info.append(leader_text)

if leaders_info:
metadata_parts.append(f"Chapter Leaders: {', '.join(leaders_info)}")
metadata_parts.append(f"Chapter Leaders: {', '.join(chapter.leaders_raw)}")

if chapter.related_urls:
valid_urls = [
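A note on the leaders_raw simplification above: the dict-parsing branch is gone, so the join now assumes leaders_raw holds plain strings. A hypothetical illustration of the new behavior (values invented):

# Assumes leaders_raw is a list of name strings; dict entries would now raise TypeError.
chapter.leaders_raw = ["Alice Example", "Bob Example"]
", ".join(chapter.leaders_raw)  # -> "Alice Example, Bob Example"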
133 changes: 133 additions & 0 deletions backend/apps/ai/management/commands/ai_create_event_chunks.py
@@ -0,0 +1,133 @@
"""A command to create chunks of OWASP event data for RAG."""

import os

import openai
from django.core.management.base import BaseCommand

from apps.ai.common.constants import DELIMITER
from apps.ai.common.utils import create_chunks_and_embeddings
from apps.ai.models.chunk import Chunk
from apps.owasp.models.event import Event


class Command(BaseCommand):
help = "Create chunks for OWASP event data"

def add_arguments(self, parser):
parser.add_argument(
"--event",
type=str,
help="Process only the event with this key",
)
parser.add_argument(
"--all",
action="store_true",
help="Process all the events",
)
parser.add_argument(
"--batch-size",
type=int,
default=50,
help="Number of events to process in each batch",
)

def handle(self, *args, **options):
if not (openai_api_key := os.getenv("DJANGO_OPEN_AI_SECRET_KEY")):
self.stdout.write(
self.style.ERROR("DJANGO_OPEN_AI_SECRET_KEY environment variable not set")
)
return

self.openai_client = openai.OpenAI(api_key=openai_api_key)

if event := options["event"]:
queryset = Event.objects.filter(key=event)
elif options["all"]:
queryset = Event.objects.all()
else:
queryset = Event.upcoming_events()

if not (total_events := queryset.count()):
self.stdout.write("No events found to process")
return

self.stdout.write(f"Found {total_events} events to process")

batch_size = options["batch_size"]
for offset in range(0, total_events, batch_size):
batch_events = queryset[offset : offset + batch_size]

batch_chunks = []
for event in batch_events:
batch_chunks.extend(self.handle_chunks(event))

if batch_chunks:
Chunk.bulk_save(batch_chunks)
self.stdout.write(f"Saved {len(batch_chunks)} chunks")

self.stdout.write(f"Completed processing all {total_events} events")

def handle_chunks(self, event: Event) -> list[Chunk]:
"""Create chunks from an event's data."""
prose_content, metadata_content = self.extract_event_content(event)

all_chunk_texts = []

if metadata_content.strip():
all_chunk_texts.append(metadata_content)

if prose_content.strip():
all_chunk_texts.extend(Chunk.split_text(prose_content))

if not all_chunk_texts:
self.stdout.write(f"No content to chunk for event {event.key}")
return []

return create_chunks_and_embeddings(
all_chunk_texts,
content_object=event,
openai_client=self.openai_client,
)

def extract_event_content(self, event: Event) -> tuple[str, str]:
"""Extract and separate prose content from metadata for an event.

Returns:
tuple[str, str]: (prose_content, metadata_content)

"""
prose_parts = []
metadata_parts = []

if event.description:
prose_parts.append(f"Description: {event.description}")

if event.summary:
prose_parts.append(f"Summary: {event.summary}")

if event.name:
metadata_parts.append(f"Event Name: {event.name}")

if event.category:
metadata_parts.append(f"Category: {event.get_category_display()}")

if event.start_date:
metadata_parts.append(f"Start Date: {event.start_date}")

if event.end_date:
metadata_parts.append(f"End Date: {event.end_date}")

if event.suggested_location:
metadata_parts.append(f"Location: {event.suggested_location}")

if event.latitude and event.longitude:
metadata_parts.append(f"Coordinates: {event.latitude}, {event.longitude}")

if event.url:
metadata_parts.append(f"Event URL: {event.url}")

return (
DELIMITER.join(filter(None, prose_parts)),
DELIMITER.join(filter(None, metadata_parts)),
)
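The command mirrors ai_create_chapter_chunks end to end. A sketch of invoking it programmatically, assuming Django's standard call_command and the parser options defined above (the event key is hypothetical):

from django.core.management import call_command

# Embed all events in batches of 25 (the default batch size is 50).
call_command("ai_create_event_chunks", all=True, batch_size=25)

# Or process a single event by key (hypothetical key).
call_command("ai_create_event_chunks", event="global-appsec-2025")

# With no flags, only Event.upcoming_events() is processed.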