Skip to content

Commit 570edb2

Browse files
committed
event chunks created
1 parent b8c0bed commit 570edb2

File tree

2 files changed

+172
-0
lines changed

2 files changed

+172
-0
lines changed

backend/apps/ai/Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@ ai-create-chapter-chunks:
22
@echo "Creating chapter chunks"
33
@CMD="python manage.py ai_create_chapter_chunks" $(MAKE) exec-backend-command
44

5+
ai-create-event-chunks:
	@echo "Creating event chunks"
	@CMD="python manage.py ai_create_event_chunks" $(MAKE) exec-backend-command
8+
59
ai-create-slack-message-chunks:
610
@echo "Creating Slack message chunks"
711
@CMD="python manage.py ai_create_slack_message_chunks" $(MAKE) exec-backend-command
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
"""A command to create chunks of OWASP event data for RAG."""
2+
3+
import os
4+
import time
5+
from datetime import UTC, datetime, timedelta
6+
7+
import openai
8+
from django.core.management.base import BaseCommand
9+
10+
from apps.ai.common.constants import (
11+
DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
12+
DELIMITER,
13+
MIN_REQUEST_INTERVAL_SECONDS,
14+
)
15+
from apps.ai.models.chunk import Chunk
16+
from apps.owasp.models.event import Event
17+
18+
19+
class Command(BaseCommand):
    """Management command that embeds OWASP event data into RAG chunks.

    For each selected event, prose and metadata text is extracted, split
    into chunks, embedded via the OpenAI embeddings API, and bulk-saved
    as ``Chunk`` rows.
    """

    help = "Create chunks for OWASP event data"

    def add_arguments(self, parser):
        """Register CLI options: --event, --all, and --batch-size."""
        parser.add_argument(
            "--event",
            type=str,
            help="Process only the event with this key",
        )
        parser.add_argument(
            "--all",
            action="store_true",
            help="Process all the events",
        )
        parser.add_argument(
            "--batch-size",
            type=int,
            default=50,
            help="Number of events to process in each batch",
        )

    def handle(self, *args, **options):
        """Select events, create embedding chunks in batches, and save them.

        Exits early (with an error message, not an exception) when the
        OpenAI API key is missing or when no events match the selection.
        """
        # Fail fast: embeddings cannot be created without the API key.
        if not (openai_api_key := os.getenv("DJANGO_OPEN_AI_SECRET_KEY")):
            self.stdout.write(
                self.style.ERROR("DJANGO_OPEN_AI_SECRET_KEY environment variable not set")
            )
            return

        self.openai_client = openai.OpenAI(api_key=openai_api_key)

        # Selection precedence: --event (single key) > --all > default
        # (upcoming events only).
        if event := options["event"]:
            queryset = Event.objects.filter(key=event)
        elif options["all"]:
            queryset = Event.objects.all()
        else:
            queryset = Event.upcoming_events()

        if not (total_events := queryset.count()):
            self.stdout.write("No events found to process")
            return

        self.stdout.write(f"Found {total_events} events to process")

        batch_size = options["batch_size"]
        # Process in slices so chunks are bulk-saved once per batch rather
        # than once per event.
        # NOTE(review): slicing a queryset without an explicit order_by()
        # can yield unstable pages across queries — confirm the model has a
        # default ordering.
        for offset in range(0, total_events, batch_size):
            batch_events = queryset[offset : offset + batch_size]

            batch_chunks = []
            for event in batch_events:
                batch_chunks.extend(self.create_chunks(event))

            if batch_chunks:
                Chunk.bulk_save(batch_chunks)
                self.stdout.write(f"Saved {len(batch_chunks)} chunks")

        self.stdout.write(f"Completed processing all {total_events} events")

    def create_chunks(self, event: Event) -> list[Chunk]:
        """Create chunks from an event's data.

        Metadata is kept as a single chunk; prose is split via
        ``Chunk.split_text``. Returns unsaved ``Chunk`` instances, or an
        empty list when there is no content or the OpenAI call fails.
        """
        prose_content, metadata_content = self.extract_event_content(event)

        all_chunk_texts = []

        if metadata_content.strip():
            all_chunk_texts.append(metadata_content)

        if prose_content.strip():
            all_chunk_texts.extend(Chunk.split_text(prose_content))

        if not all_chunk_texts:
            self.stdout.write(f"No content to chunk for event {event.key}")
            return []

        try:
            # Simple client-side rate limit: the getattr default makes the
            # first call look "old enough" to proceed without sleeping.
            time_since_last_request = datetime.now(UTC) - getattr(
                self,
                "last_request_time",
                datetime.now(UTC) - timedelta(seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS),
            )

            if time_since_last_request < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
                time.sleep(MIN_REQUEST_INTERVAL_SECONDS - time_since_last_request.total_seconds())

            # One API call embeds all chunk texts for this event at once.
            response = self.openai_client.embeddings.create(
                input=all_chunk_texts,
                model="text-embedding-3-small",
            )
            self.last_request_time = datetime.now(UTC)

            # strict=True guards against a text/embedding count mismatch;
            # Chunk.update_data may return None (filtered out by the walrus
            # condition), and save=False defers persistence to bulk_save.
            return [
                chunk
                for text, embedding in zip(
                    all_chunk_texts,
                    [d.embedding for d in response.data],
                    strict=True,
                )
                if (
                    chunk := Chunk.update_data(
                        text=text,
                        content_object=event,
                        embedding=embedding,
                        save=False,
                    )
                )
            ]
        except openai.OpenAIError as e:
            # Best-effort per event: log and skip rather than abort the run.
            self.stdout.write(self.style.ERROR(f"OpenAI API error for event {event.key}: {e}"))
            return []

    def extract_event_content(self, event: Event) -> tuple[str, str]:
        """Extract and separate prose content from metadata for an event.

        Returns:
            tuple[str, str]: (prose_content, metadata_content)

        """
        prose_parts = []
        metadata_parts = []

        # Prose: free-text fields that benefit from text splitting.
        if event.description:
            prose_parts.append(f"Description: {event.description}")

        if event.summary:
            prose_parts.append(f"Summary: {event.summary}")

        # Metadata: short structured facts kept together in one chunk.
        if event.name:
            metadata_parts.append(f"Event Name: {event.name}")

        if event.category:
            metadata_parts.append(f"Category: {event.get_category_display()}")

        if event.start_date:
            metadata_parts.append(f"Start Date: {event.start_date}")

        if event.end_date:
            metadata_parts.append(f"End Date: {event.end_date}")

        if event.suggested_location:
            metadata_parts.append(f"Location: {event.suggested_location}")

        # Both coordinates required; a lone latitude/longitude is skipped.
        if event.latitude and event.longitude:
            metadata_parts.append(f"Coordinates: {event.latitude}, {event.longitude}")

        if event.url:
            metadata_parts.append(f"Event URL: {event.url}")

        return (
            DELIMITER.join(filter(None, prose_parts)),
            DELIMITER.join(filter(None, metadata_parts)),
        )

0 commit comments

Comments
 (0)