Skip to content

Commit f529179

Browse files
committed
reusable code
1 parent f67a88b commit f529179

File tree

4 files changed

+82
-133
lines changed

4 files changed

+82
-133
lines changed
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"""Shared helper for creating AI chunks with OpenAI embeddings."""

import logging
import time
from datetime import UTC, datetime, timedelta

from apps.ai.common.constants import (
    DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
    MIN_REQUEST_INTERVAL_SECONDS,
)
from apps.ai.models.chunk import Chunk

logger = logging.getLogger(__name__)

# Timestamp of the most recent embeddings request, shared across calls.
# The pre-refactor commands kept this on the command instance
# (self.last_request_time); recomputing it locally on every call made
# time_since_last_request a constant equal to the offset, so the throttle
# never reflected the actual previous request.
_last_request_time = datetime.now(UTC) - timedelta(
    seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS
)


def create_chunks_and_embeddings(
    all_chunk_texts: list[str],
    content_object,
    openai_client,
) -> list[Chunk]:
    """Create chunks and embeddings from given texts using OpenAI embeddings.

    Args:
        all_chunk_texts (list[str]): List of text chunks to embed.
        content_object: The object to associate the chunks with.
        openai_client: Initialized OpenAI client instance.

    Returns:
        list[Chunk]: List of Chunk instances (not saved). Empty on API error.
    """
    global _last_request_time

    try:
        # Throttle: keep at least MIN_REQUEST_INTERVAL_SECONDS between
        # consecutive embeddings requests, across every caller of this helper.
        time_since_last_request = datetime.now(UTC) - _last_request_time
        if time_since_last_request < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
            time.sleep(
                MIN_REQUEST_INTERVAL_SECONDS - time_since_last_request.total_seconds()
            )

        response = openai_client.embeddings.create(
            input=all_chunk_texts,
            model="text-embedding-3-small",
        )
        # Record the request time so the next call can rate-limit correctly;
        # the refactor had dropped this update (self.last_request_time = ...).
        _last_request_time = datetime.now(UTC)

        # Pair each input text with its embedding (strict=True guards against
        # a length mismatch) and build unsaved Chunk instances, skipping any
        # that Chunk.update_data declines to create.
        return [
            chunk
            for text, embedding in zip(
                all_chunk_texts,
                [d.embedding for d in response.data],
                strict=True,
            )
            if (
                chunk := Chunk.update_data(
                    text=text,
                    content_object=content_object,
                    embedding=embedding,
                    save=False,
                )
            )
        ]
    except Exception:
        # NOTE(review): the per-command versions caught openai.OpenAIError
        # specifically; openai is not imported in this module, so we catch
        # broadly but log with a full traceback instead of print().
        logger.exception("OpenAI API error while creating embeddings")
        return []

backend/apps/ai/management/commands/ai_create_chapter_chunks.py

Lines changed: 9 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,12 @@
11
"""A command to create chunks of OWASP chapter data for RAG."""
22

33
import os
4-
import time
5-
from datetime import UTC, datetime, timedelta
64

75
import openai
86
from django.core.management.base import BaseCommand
97

10-
from apps.ai.common.constants import (
11-
DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
12-
DELIMITER,
13-
MIN_REQUEST_INTERVAL_SECONDS,
14-
)
8+
from apps.ai.common.constants import DELIMITER
9+
from apps.ai.common.create_chunks_and_embeddings import create_chunks_and_embeddings
1510
from apps.ai.models.chunk import Chunk
1611
from apps.owasp.models.chapter import Chapter
1712

@@ -65,15 +60,15 @@ def handle(self, *args, **options):
6560

6661
batch_chunks = []
6762
for chapter in batch_chapters:
68-
batch_chunks.extend(self.create_chunks(chapter))
63+
batch_chunks.extend(self.handle_chunks(chapter))
6964

7065
if batch_chunks:
7166
Chunk.bulk_save(batch_chunks)
7267
self.stdout.write(f"Saved {len(batch_chunks)} chunks")
7368

7469
self.stdout.write(f"Completed processing all {total_chapters} chapters")
7570

76-
def create_chunks(self, chapter: Chapter) -> list[Chunk]:
71+
def handle_chunks(self, chapter: Chapter) -> list[Chunk]:
7772
"""Create chunks from a chapter's data."""
7873
prose_content, metadata_content = self.extract_chapter_content(chapter)
7974

@@ -89,41 +84,11 @@ def create_chunks(self, chapter: Chapter) -> list[Chunk]:
8984
self.stdout.write(f"No content to chunk for chapter {chapter.key}")
9085
return []
9186

92-
try:
93-
time_since_last_request = datetime.now(UTC) - getattr(
94-
self,
95-
"last_request_time",
96-
datetime.now(UTC) - timedelta(seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS),
97-
)
98-
99-
if time_since_last_request < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
100-
time.sleep(MIN_REQUEST_INTERVAL_SECONDS - time_since_last_request.total_seconds())
101-
102-
response = self.openai_client.embeddings.create(
103-
input=all_chunk_texts,
104-
model="text-embedding-3-small",
105-
)
106-
self.last_request_time = datetime.now(UTC)
107-
108-
return [
109-
chunk
110-
for text, embedding in zip(
111-
all_chunk_texts,
112-
[d.embedding for d in response.data],
113-
strict=True,
114-
)
115-
if (
116-
chunk := Chunk.update_data(
117-
text=text,
118-
content_object=chapter,
119-
embedding=embedding,
120-
save=False,
121-
)
122-
)
123-
]
124-
except openai.OpenAIError as e:
125-
self.stdout.write(self.style.ERROR(f"OpenAI API error for chapter {chapter.key}: {e}"))
126-
return []
87+
return create_chunks_and_embeddings(
88+
all_chunk_texts=all_chunk_texts,
89+
content_object=chapter,
90+
openai_client=self.openai_client,
91+
)
12792

12893
def extract_chapter_content(self, chapter: Chapter) -> tuple[str, str]:
12994
"""Extract and separate prose content from metadata for a chapter.

backend/apps/ai/management/commands/ai_create_event_chunks.py

Lines changed: 9 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,12 @@
11
"""A command to create chunks of OWASP event data for RAG."""
22

33
import os
4-
import time
5-
from datetime import UTC, datetime, timedelta
64

75
import openai
86
from django.core.management.base import BaseCommand
97

10-
from apps.ai.common.constants import (
11-
DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
12-
DELIMITER,
13-
MIN_REQUEST_INTERVAL_SECONDS,
14-
)
8+
from apps.ai.common.constants import DELIMITER
9+
from apps.ai.common.create_chunks_and_embeddings import create_chunks_and_embeddings
1510
from apps.ai.models.chunk import Chunk
1611
from apps.owasp.models.event import Event
1712

@@ -65,15 +60,15 @@ def handle(self, *args, **options):
6560

6661
batch_chunks = []
6762
for event in batch_events:
68-
batch_chunks.extend(self.create_chunks(event))
63+
batch_chunks.extend(self.handle_chunks(event))
6964

7065
if batch_chunks:
7166
Chunk.bulk_save(batch_chunks)
7267
self.stdout.write(f"Saved {len(batch_chunks)} chunks")
7368

7469
self.stdout.write(f"Completed processing all {total_events} events")
7570

76-
def create_chunks(self, event: Event) -> list[Chunk]:
71+
def handle_chunks(self, event: Event) -> list[Chunk]:
7772
"""Create chunks from an event's data."""
7873
prose_content, metadata_content = self.extract_event_content(event)
7974

@@ -89,41 +84,11 @@ def create_chunks(self, event: Event) -> list[Chunk]:
8984
self.stdout.write(f"No content to chunk for event {event.key}")
9085
return []
9186

92-
try:
93-
time_since_last_request = datetime.now(UTC) - getattr(
94-
self,
95-
"last_request_time",
96-
datetime.now(UTC) - timedelta(seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS),
97-
)
98-
99-
if time_since_last_request < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
100-
time.sleep(MIN_REQUEST_INTERVAL_SECONDS - time_since_last_request.total_seconds())
101-
102-
response = self.openai_client.embeddings.create(
103-
input=all_chunk_texts,
104-
model="text-embedding-3-small",
105-
)
106-
self.last_request_time = datetime.now(UTC)
107-
108-
return [
109-
chunk
110-
for text, embedding in zip(
111-
all_chunk_texts,
112-
[d.embedding for d in response.data],
113-
strict=True,
114-
)
115-
if (
116-
chunk := Chunk.update_data(
117-
text=text,
118-
content_object=event,
119-
embedding=embedding,
120-
save=False,
121-
)
122-
)
123-
]
124-
except openai.OpenAIError as e:
125-
self.stdout.write(self.style.ERROR(f"OpenAI API error for event {event.key}: {e}"))
126-
return []
87+
return create_chunks_and_embeddings(
88+
all_chunk_texts,
89+
content_object=event,
90+
openai_client=self.openai_client,
91+
)
12792

12893
def extract_event_content(self, event: Event) -> tuple[str, str]:
12994
"""Extract and separate prose content from metadata for an event.
Lines changed: 8 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,11 @@
11
"""A command to create chunks of Slack messages."""
22

33
import os
4-
import time
5-
from datetime import UTC, datetime, timedelta
64

75
import openai
86
from django.core.management.base import BaseCommand
97

10-
from apps.ai.common.constants import (
11-
DEFAULT_LAST_REQUEST_OFFSET_SECONDS,
12-
MIN_REQUEST_INTERVAL_SECONDS,
13-
)
8+
from apps.ai.common.create_chunks_and_embeddings import create_chunks_and_embeddings
149
from apps.ai.models.chunk import Chunk
1510
from apps.slack.models.message import Message
1611

@@ -36,13 +31,13 @@ def handle(self, *args, **options):
3631
[
3732
chunk
3833
for message in Message.objects.all()[offset : offset + batch_size]
39-
for chunk in self.create_chunks(message)
34+
for chunk in self.handle_chunks(message)
4035
]
4136
)
4237

4338
self.stdout.write(f"Completed processing all {total_messages} messages")
4439

45-
def create_chunks(self, message: Message) -> list[Chunk]:
40+
def handle_chunks(self, message: Message) -> list[Chunk]:
4641
"""Create chunks from a message."""
4742
if message.subtype in {"channel_join", "channel_leave"}:
4843
return []
@@ -54,40 +49,8 @@ def create_chunks(self, message: Message) -> list[Chunk]:
5449
)
5550
return []
5651

57-
try:
58-
time_since_last_request = datetime.now(UTC) - getattr(
59-
self,
60-
"last_request_time",
61-
datetime.now(UTC) - timedelta(seconds=DEFAULT_LAST_REQUEST_OFFSET_SECONDS),
62-
)
63-
64-
if time_since_last_request < timedelta(seconds=MIN_REQUEST_INTERVAL_SECONDS):
65-
time.sleep(MIN_REQUEST_INTERVAL_SECONDS - time_since_last_request.total_seconds())
66-
67-
response = self.openai_client.embeddings.create(
68-
input=chunk_text,
69-
model="text-embedding-3-small",
70-
)
71-
self.last_request_time = datetime.now(UTC)
72-
73-
return [
74-
chunk
75-
for text, embedding in zip(
76-
chunk_text,
77-
[d.embedding for d in response.data], # Embedding data from OpenAI response.
78-
strict=True,
79-
)
80-
if (
81-
chunk := Chunk.update_data(
82-
text=text,
83-
content_object=message,
84-
embedding=embedding,
85-
save=False,
86-
)
87-
)
88-
]
89-
except openai.OpenAIError as e:
90-
self.stdout.write(
91-
self.style.ERROR(f"OpenAI API error for message {message.slack_message_id}: {e}")
92-
)
93-
return []
52+
return create_chunks_and_embeddings(
53+
all_chunk_texts=chunk_text,
54+
content_object=message,
55+
openai_client=self.openai_client,
56+
)

0 commit comments

Comments
 (0)