1+ """A command to create chunks of OWASP event data for RAG."""
2+
3+ import os
4+ import time
5+ from datetime import UTC , datetime , timedelta
6+
7+ import openai
8+ from django .core .management .base import BaseCommand
9+
10+ from apps .ai .common .constants import (
11+ DEFAULT_LAST_REQUEST_OFFSET_SECONDS ,
12+ DELIMITER ,
13+ MIN_REQUEST_INTERVAL_SECONDS ,
14+ )
15+ from apps .ai .models .chunk import Chunk
16+ from apps .owasp .models .event import Event
17+
18+
class Command(BaseCommand):
    """Create embedding chunks of OWASP event data for RAG retrieval."""

    help = "Create chunks for OWASP event data"

    def add_arguments(self, parser):
        """Register the command's CLI options."""
        parser.add_argument(
            "--event",
            type=str,
            help="Process only the event with this key",
        )
        parser.add_argument(
            "--all",
            action="store_true",
            help="Process all the events",
        )
        parser.add_argument(
            "--batch-size",
            type=int,
            default=50,
            help="Number of events to process in each batch",
        )

    def handle(self, *args, **options):
        """Select the target events and create/save their chunks in batches.

        Requires the DJANGO_OPEN_AI_SECRET_KEY environment variable; exits
        early with an error message when it is not set.
        """
        if not (openai_api_key := os.getenv("DJANGO_OPEN_AI_SECRET_KEY")):
            self.stdout.write(
                self.style.ERROR("DJANGO_OPEN_AI_SECRET_KEY environment variable not set")
            )
            return

        self.openai_client = openai.OpenAI(api_key=openai_api_key)

        # --event takes precedence over --all; default is upcoming events only.
        if event_key := options["event"]:
            queryset = Event.objects.filter(key=event_key)
        elif options["all"]:
            queryset = Event.objects.all()
        else:
            queryset = Event.upcoming_events()

        if not (total_events := queryset.count()):
            self.stdout.write("No events found to process")
            return

        self.stdout.write(f"Found {total_events} events to process")

        batch_size = options["batch_size"]
        for offset in range(0, total_events, batch_size):
            batch_events = queryset[offset : offset + batch_size]

            batch_chunks = []
            for event in batch_events:
                batch_chunks.extend(self.create_chunks(event))

            # Persist per batch so one failing batch doesn't lose earlier work.
            if batch_chunks:
                Chunk.bulk_save(batch_chunks)
                self.stdout.write(f"Saved {len(batch_chunks)} chunks")

        self.stdout.write(f"Completed processing all {total_events} events")

    def _respect_rate_limit(self) -> None:
        """Sleep just long enough to keep MIN_REQUEST_INTERVAL_SECONDS between API calls.

        Uses self.last_request_time when a previous request has been made;
        otherwise assumes the last request was DEFAULT_LAST_REQUEST_OFFSET_SECONDS
        ago so the first call is not delayed.
        """
        last_request_time = getattr(
            self,
            "last_request_time",
            datetime.now(UTC) - timedelta(seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS),
        )
        elapsed = datetime.now(UTC) - last_request_time
        if elapsed < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
            time.sleep(MIN_REQUEST_INTERVAL_SECONDS - elapsed.total_seconds())

    def create_chunks(self, event: Event) -> list[Chunk]:
        """Create unsaved Chunk instances with embeddings for one event.

        Metadata is embedded as a single chunk; prose is split via
        Chunk.split_text. Returns an empty list when the event has no
        content or the OpenAI embeddings call fails.
        """
        prose_content, metadata_content = self.extract_event_content(event)

        all_chunk_texts = []

        if metadata_content.strip():
            all_chunk_texts.append(metadata_content)

        if prose_content.strip():
            all_chunk_texts.extend(Chunk.split_text(prose_content))

        if not all_chunk_texts:
            self.stdout.write(f"No content to chunk for event {event.key}")
            return []

        try:
            self._respect_rate_limit()

            response = self.openai_client.embeddings.create(
                input=all_chunk_texts,
                model="text-embedding-3-small",
            )
            self.last_request_time = datetime.now(UTC)

            # strict=True: the API must return exactly one embedding per input
            # text; a mismatch raises instead of silently mispairing.
            return [
                chunk
                for text, embedding in zip(
                    all_chunk_texts,
                    [d.embedding for d in response.data],
                    strict=True,
                )
                if (
                    chunk := Chunk.update_data(
                        text=text,
                        content_object=event,
                        embedding=embedding,
                        save=False,
                    )
                )
            ]
        except openai.OpenAIError as e:
            self.stdout.write(self.style.ERROR(f"OpenAI API error for event {event.key}: {e}"))
            return []

    def extract_event_content(self, event: Event) -> tuple[str, str]:
        """Extract and separate prose content from metadata for an event.

        Returns:
            tuple[str, str]: (prose_content, metadata_content)

        """
        prose_parts = []
        metadata_parts = []

        if event.description:
            prose_parts.append(f"Description: {event.description}")

        if event.summary:
            prose_parts.append(f"Summary: {event.summary}")

        if event.name:
            metadata_parts.append(f"Event Name: {event.name}")

        if event.category:
            metadata_parts.append(f"Category: {event.get_category_display()}")

        if event.start_date:
            metadata_parts.append(f"Start Date: {event.start_date}")

        if event.end_date:
            metadata_parts.append(f"End Date: {event.end_date}")

        if event.suggested_location:
            metadata_parts.append(f"Location: {event.suggested_location}")

        # `is not None` (not truthiness) so coordinates of 0.0 — the equator
        # or prime meridian — are not silently dropped.
        if event.latitude is not None and event.longitude is not None:
            metadata_parts.append(f"Coordinates: {event.latitude}, {event.longitude}")

        if event.url:
            metadata_parts.append(f"Event URL: {event.url}")

        return (
            DELIMITER.join(filter(None, prose_parts)),
            DELIMITER.join(filter(None, metadata_parts)),
        )