|
| 1 | +"""Content extractor for Repository.""" |
| 2 | + |
| 3 | +import json |
| 4 | +import logging |
| 5 | +import time |
| 6 | + |
| 7 | +from apps.ai.common.constants import DELIMITER, GITHUB_REQUEST_INTERVAL_SECONDS |
| 8 | +from apps.common.utils import is_valid_json |
| 9 | +from apps.github.utils import get_repository_file_content |
| 10 | + |
| 11 | +logger = logging.getLogger(__name__) |
| 12 | + |
| 13 | + |
def extract_repository_content(repository) -> tuple[str, str]:
    """Extract structured content from repository data.

    Builds a JSON document from the repository's truthy attributes (falsy
    values are omitted) and, when an owner login and a repository key are
    available, adds the text of well-known markdown files fetched from GitHub.

    Args:
        repository: Repository instance

    Returns:
        tuple[str, str]: (json_content, metadata_content)

    """
    repository_data = {}

    # Plain attributes are copied under their own name, only when truthy.
    for field in ("name", "key", "description", "homepage", "license", "topics"):
        value = getattr(repository, field)
        if value:
            repository_data[field] = value

    status = {
        label: True
        for label, flag in (
            ("archived", repository.is_archived),
            ("empty", repository.is_empty),
            ("owasp_repository", repository.is_owasp_repository),
            ("owasp_site_repository", repository.is_owasp_site_repository),
        )
        if flag
    }
    if status:
        repository_data["status"] = status

    funding = {
        label: True
        for label, flag in (
            ("policy_compliant", repository.is_funding_policy_compliant),
            ("has_funding_yml", repository.has_funding_yml),
        )
        if flag
    }
    if funding:
        repository_data["funding"] = funding

    if repository.pages_status:
        repository_data["pages_status"] = repository.pages_status

    features = [
        label
        for label, enabled in (
            ("downloads", repository.has_downloads),
            ("issues", repository.has_issues),
            ("pages", repository.has_pages),
            ("projects", repository.has_projects),
            ("wiki", repository.has_wiki),
        )
        if enabled
    ]
    if features:
        repository_data["features"] = features

    stats = {
        label: count
        for label, count in (
            ("commits", repository.commits_count),
            ("contributors", repository.contributors_count),
            ("forks", repository.forks_count),
            ("open_issues", repository.open_issues_count),
            ("stars", repository.stars_count),
            ("subscribers", repository.subscribers_count),
            ("watchers", repository.watchers_count),
        )
        if count
    }
    if stats:
        repository_data["statistics"] = stats

    dates = {
        label: moment.strftime("%Y-%m-%d")
        for label, moment in (
            ("created", repository.created_at),
            ("last_updated", repository.updated_at),
            ("last_pushed", repository.pushed_at),
        )
        if moment
    }
    if dates:
        repository_data["dates"] = dates

    ownership = {}
    if repository.organization:
        ownership["organization"] = repository.organization.login
    if repository.owner:
        ownership["owner"] = repository.owner.login
    if ownership:
        repository_data["ownership"] = ownership

    markdown_files = [
        "README.md",
        "index.md",
        "info.md",
        "leaders.md",
    ]

    # The organization login takes precedence over the owner login in URLs.
    if repository.organization:
        owner = repository.organization.login
    else:
        owner = repository.owner.login if repository.owner else ""
    branch = repository.default_branch or "main"

    markdown_content = {}
    # Without both an owner and a key no URL can be built, so nothing is fetched.
    if owner and repository.key:
        tab_files = []
        contents_url = (
            f"https://api.github.com/repos/{owner}/{repository.key}/contents/?ref={branch}"
        )
        # NOTE(review): presumably returns the response body as text - confirm
        # against apps.github.utils.
        response = get_repository_file_content(contents_url)
        if response and is_valid_json(response):
            items = json.loads(response)
            # Only a JSON array of objects is a usable listing. A valid-JSON
            # object (e.g. an API error body) or entries without a string
            # "name" are ignored instead of raising AttributeError.
            if isinstance(items, list):
                for item in items:
                    name = item.get("name") if isinstance(item, dict) else None
                    if (
                        isinstance(name, str)
                        and name.startswith("tab_")
                        and name.endswith(".md")
                    ):
                        tab_files.append(name)

        for file_path in [*markdown_files, *tab_files]:
            try:
                raw_url = (
                    f"https://raw.githubusercontent.com/{owner}/{repository.key}/"
                    f"{branch}/{file_path}"
                )
                content = get_repository_file_content(raw_url)

                if content and content.strip():
                    markdown_content[file_path] = content
                # Pause between consecutive requests, whether or not the file
                # had any content.
                time.sleep(GITHUB_REQUEST_INTERVAL_SECONDS)

            except (ValueError, TypeError, OSError):
                # Best effort: a file that cannot be fetched is skipped.
                logger.debug("Failed to fetch markdown file")

    if markdown_content:
        repository_data["markdown_content"] = markdown_content

    json_content = json.dumps(repository_data, indent=2)

    metadata_parts = []
    if repository.name:
        metadata_parts.append(f"Repository Name: {repository.name}")
    if repository.key:
        metadata_parts.append(f"Repository Key: {repository.key}")
    if repository.organization:
        metadata_parts.append(f"Organization: {repository.organization.login}")
    if repository.owner:
        metadata_parts.append(f"Owner: {repository.owner.login}")

    # Every part is a non-empty string, so they can be joined directly.
    return (
        json_content,
        DELIMITER.join(metadata_parts),
    )
0 commit comments