Skip to content

Commit 9b0c530

Browse files
Dishant1804 and arkid15r committed
Sync www-repositories (#2164)
* spelling fixes and tests * sonar and code rabbit suggestions implemented * json chunking and suggestions implemented * code rabbit and sonar qube suggestions * code rabbit suggestions * suggestions implemented * github advance security addressed * tests fixed * fixed tests * Clean up backend/test_commands.py --------- Co-authored-by: Arkadii Yakovets <2201626+arkid15r@users.noreply.github.com> Co-authored-by: Arkadii Yakovets <arkadii.yakovets@owasp.org>
1 parent 9f6ec86 commit 9b0c530

File tree

13 files changed

+1202
-15
lines changed

13 files changed

+1202
-15
lines changed

backend/apps/ai/Makefile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,14 @@ ai-update-project-context:
3434
@echo "Updating project context"
3535
@CMD="python manage.py ai_update_project_context" $(MAKE) exec-backend-command
3636

37+
ai-update-repository-chunks:
38+
@echo "Updating repository chunks"
39+
@CMD="python manage.py ai_update_repository_chunks" $(MAKE) exec-backend-command
40+
41+
ai-update-repository-context:
42+
@echo "Updating repository context"
43+
@CMD="python manage.py ai_update_repository_context" $(MAKE) exec-backend-command
44+
3745
ai-update-slack-message-chunks:
3846
@echo "Updating Slack message chunks"
3947
@CMD="python manage.py ai_update_slack_message_chunks" $(MAKE) exec-backend-command

backend/apps/ai/common/base/chunk_command.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from apps.ai.common.utils import create_chunks_and_embeddings
88
from apps.ai.models.chunk import Chunk
99
from apps.ai.models.context import Context
10+
from apps.common.utils import is_valid_json
1011

1112

1213
class BaseChunkCommand(BaseAICommand):
@@ -43,10 +44,14 @@ def process_chunks_batch(self, entities: list[Model]) -> int:
4344
count, _ = context.chunks.all().delete()
4445
self.stdout.write(f"Deleted {count} stale chunks for {entity_key}")
4546

46-
prose_content, metadata_content = self.extract_content(entity)
47-
full_content = (
48-
f"{metadata_content}\n\n{prose_content}" if metadata_content else prose_content
49-
)
47+
content, metadata_content = self.extract_content(entity)
48+
49+
if is_valid_json(content):
50+
full_content = content
51+
else:
52+
full_content = (
53+
f"{metadata_content}\n\n{content}" if metadata_content else content
54+
)
5055

5156
if not full_content.strip():
5257
self.stdout.write(f"No content to chunk for {self.entity_name} {entity_key}")

backend/apps/ai/common/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44
DEFAULT_CHUNKS_RETRIEVAL_LIMIT = 5
55
DEFAULT_SIMILARITY_THRESHOLD = 0.4
66
DELIMITER = "\n\n"
7+
GITHUB_REQUEST_INTERVAL_SECONDS = 0.5
78
MIN_REQUEST_INTERVAL_SECONDS = 1.2
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
"""Content extractor for Repository."""
2+
3+
import json
4+
import logging
5+
import time
6+
7+
from apps.ai.common.constants import DELIMITER, GITHUB_REQUEST_INTERVAL_SECONDS
8+
from apps.common.utils import is_valid_json
9+
from apps.github.utils import get_repository_file_content
10+
11+
logger = logging.getLogger(__name__)
12+
13+
14+
def extract_repository_content(repository) -> tuple[str, str]:
    """Extract structured content from repository data.

    Collects the repository's truthy fields (basic info, status, funding,
    features, statistics, dates, ownership) together with the text of a fixed
    set of markdown files and any root-level ``tab_*.md`` files fetched from
    GitHub, and serializes them as indented JSON. A short plain-text metadata
    summary is returned alongside it.

    Args:
        repository: Repository instance

    Returns:
        tuple[str, str]: (json_content, metadata_content)

    """
    repository_data = _build_repository_data(repository)

    markdown_files = [
        "README.md",
        "index.md",
        "info.md",
        "leaders.md",
    ]

    if repository.organization:
        owner = repository.organization.login
    else:
        owner = repository.owner.login if repository.owner else ""
    branch = repository.default_branch or "main"

    # Without an owner and a key no GitHub URL can be built, so no request is
    # made at all (the check is loop-invariant, hence done once up front).
    if owner and repository.key:
        file_paths = [*markdown_files, *_list_tab_files(owner, repository.key, branch)]
        markdown_content = _fetch_markdown_content(owner, repository.key, branch, file_paths)
        if markdown_content:
            repository_data["markdown_content"] = markdown_content

    json_content = json.dumps(repository_data, indent=2)

    metadata_parts = []
    if repository.name:
        metadata_parts.append(f"Repository Name: {repository.name}")
    if repository.key:
        metadata_parts.append(f"Repository Key: {repository.key}")
    if repository.organization:
        metadata_parts.append(f"Organization: {repository.organization.login}")
    if repository.owner:
        metadata_parts.append(f"Owner: {repository.owner.login}")

    return (
        json_content,
        DELIMITER.join(metadata_parts),
    )


def _build_repository_data(repository) -> dict:
    """Return the repository's truthy model fields grouped into a dict.

    Key insertion order is significant: it determines the layout of the JSON
    document produced by the caller.
    """
    repository_data = {}

    for field in ("name", "key", "description", "homepage", "license", "topics"):
        value = getattr(repository, field)
        if value:
            repository_data[field] = value

    status = {
        label: True
        for attribute, label in (
            ("is_archived", "archived"),
            ("is_empty", "empty"),
            ("is_owasp_repository", "owasp_repository"),
            ("is_owasp_site_repository", "owasp_site_repository"),
        )
        if getattr(repository, attribute)
    }
    if status:
        repository_data["status"] = status

    funding = {
        label: True
        for attribute, label in (
            ("is_funding_policy_compliant", "policy_compliant"),
            ("has_funding_yml", "has_funding_yml"),
        )
        if getattr(repository, attribute)
    }
    if funding:
        repository_data["funding"] = funding

    if repository.pages_status:
        repository_data["pages_status"] = repository.pages_status

    features = [
        feature
        for feature in ("downloads", "issues", "pages", "projects", "wiki")
        if getattr(repository, f"has_{feature}")
    ]
    if features:
        repository_data["features"] = features

    stats = {}
    for label in (
        "commits",
        "contributors",
        "forks",
        "open_issues",
        "stars",
        "subscribers",
        "watchers",
    ):
        count = getattr(repository, f"{label}_count")
        if count:
            stats[label] = count
    if stats:
        repository_data["statistics"] = stats

    dates = {}
    for attribute, label in (
        ("created_at", "created"),
        ("updated_at", "last_updated"),
        ("pushed_at", "last_pushed"),
    ):
        moment = getattr(repository, attribute)
        if moment:
            dates[label] = moment.strftime("%Y-%m-%d")
    if dates:
        repository_data["dates"] = dates

    ownership = {}
    if repository.organization:
        ownership["organization"] = repository.organization.login
    if repository.owner:
        ownership["owner"] = repository.owner.login
    if ownership:
        repository_data["ownership"] = ownership

    return repository_data


def _list_tab_files(owner: str, key: str, branch: str) -> list[str]:
    """Return the names of ``tab_*.md`` entries in the repository root listing."""
    contents_url = f"https://api.github.com/repos/{owner}/{key}/contents/?ref={branch}"
    response = get_repository_file_content(contents_url)
    if not (response and is_valid_json(response)):
        return []

    items = json.loads(response)
    # Valid JSON is not necessarily a list of objects: an object payload (such
    # as an API error message) would otherwise be iterated key by key and
    # crash on ``.get``. Anything that is not a list is treated as "no files".
    if not isinstance(items, list):
        return []

    tab_files = []
    for item in items:
        name = item.get("name") if isinstance(item, dict) else None
        if isinstance(name, str) and name.startswith("tab_") and name.endswith(".md"):
            tab_files.append(name)
    return tab_files


def _fetch_markdown_content(
    owner: str, key: str, branch: str, file_paths: list[str]
) -> dict[str, str]:
    """Fetch each file from raw.githubusercontent.com, keeping non-blank ones."""
    markdown_content = {}
    for file_path in file_paths:
        try:
            raw_url = f"https://raw.githubusercontent.com/{owner}/{key}/{branch}/{file_path}"
            content = get_repository_file_content(raw_url)

            if content and content.strip():
                markdown_content[file_path] = content
            # Space out consecutive requests to GitHub.
            time.sleep(GITHUB_REQUEST_INTERVAL_SECONDS)

        except (ValueError, TypeError, OSError):
            # Best effort: one unreadable file must not abort the extraction.
            logger.debug("Failed to fetch markdown file")
            continue
    return markdown_content
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""A command to create chunks of OWASP repository data for RAG."""
2+
3+
from django.db.models import QuerySet
4+
5+
from apps.ai.common.base.chunk_command import BaseChunkCommand
6+
from apps.ai.common.extractors.repository import extract_repository_content
7+
from apps.github.models.repository import Repository
8+
9+
10+
class Command(BaseChunkCommand):
    """Create chunks of OWASP repository data for RAG."""

    key_field_name = "key"
    model_class = Repository

    def __init__(self, *args, **kwargs):
        """Set up the command and its plural entity name."""
        super().__init__(*args, **kwargs)
        self.entity_name_plural = "repositories"

    def extract_content(self, entity: Repository) -> tuple[str, str]:
        """Delegate content extraction to the repository extractor."""
        return extract_repository_content(entity)

    def get_base_queryset(self) -> QuerySet:
        """Return the parent queryset narrowed to OWASP site repositories.

        Archived and empty repositories are excluded.
        """
        site_repository_filters = {
            "is_owasp_site_repository": True,
            "is_archived": False,
            "is_empty": False,
        }
        queryset = super().get_base_queryset()
        return queryset.filter(**site_repository_filters)

    def get_default_queryset(self) -> QuerySet:
        """Use the base queryset directly.

        Overridden to avoid the is_active filter, since Repository doesn't
        have that field.
        """
        return self.get_base_queryset()

    def source_name(self) -> str:
        """Return the source name used for context creation."""
        return "owasp_repository"
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""A command to update context for OWASP repository data."""
2+
3+
from django.db.models import QuerySet
4+
5+
from apps.ai.common.base.context_command import BaseContextCommand
6+
from apps.ai.common.extractors.repository import extract_repository_content
7+
from apps.github.models.repository import Repository
8+
9+
10+
class Command(BaseContextCommand):
    """Update context for OWASP repository data."""

    key_field_name = "key"
    model_class = Repository

    def __init__(self, *args, **kwargs):
        """Set up the command and its plural entity name."""
        super().__init__(*args, **kwargs)
        self.entity_name_plural = "repositories"

    def extract_content(self, entity: Repository) -> tuple[str, str]:
        """Delegate content extraction to the repository extractor."""
        return extract_repository_content(entity)

    def get_base_queryset(self) -> QuerySet:
        """Return the parent queryset narrowed to OWASP site repositories.

        Archived and empty repositories are excluded.
        """
        site_repository_filters = {
            "is_owasp_site_repository": True,
            "is_archived": False,
            "is_empty": False,
        }
        queryset = super().get_base_queryset()
        return queryset.filter(**site_repository_filters)

    def get_default_queryset(self) -> QuerySet:
        """Use the base queryset directly.

        Overridden to avoid the is_active filter, since Repository doesn't
        have that field.
        """
        return self.get_base_queryset()

    def source_name(self) -> str:
        """Return the source name used for context creation."""
        return "owasp_repository"

backend/apps/ai/models/chunk.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ def bulk_save(chunks, fields=None):
3535
def split_text(text: str) -> list[str]:
3636
"""Split text into chunks."""
3737
return RecursiveCharacterTextSplitter(
38-
chunk_size=300,
39-
chunk_overlap=40,
38+
chunk_size=500,
39+
chunk_overlap=80,
4040
length_function=len,
4141
separators=["\n\n", "\n", " ", ""],
4242
).split_text(text)

backend/apps/common/utils.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
import json
56
import re
67
from datetime import UTC, datetime
78
from urllib.parse import urlparse
@@ -102,6 +103,23 @@ def get_user_ip_address(request) -> str:
102103
return x_forwarded_for.split(",")[0] if x_forwarded_for else request.META.get("REMOTE_ADDR")
103104

104105

106+
def is_valid_json(content: str) -> bool:
    """Check if content is JSON format.

    Args:
        content: The content to check

    Returns:
        bool: True if content is valid JSON, False otherwise

    """
    try:
        json.loads(content)
    except (TypeError, ValueError, RecursionError):
        # TypeError: content is not str/bytes/bytearray (e.g. None).
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
        # RecursionError: nesting too deep for the parser to handle; such
        # content cannot be loaded, so it is reported as not valid instead
        # of crashing the caller.
        return False
    return True
121+
122+
105123
def join_values(fields: list, delimiter: str = " ") -> str:
106124
"""Join non-empty field values using a specified delimiter.
107125

0 commit comments

Comments
 (0)