diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 17e6628a..ed498934 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -17,7 +17,7 @@ jobs:
   test:
     runs-on: ${{ matrix.os }}
     strategy:
-      fail-fast: true
+      fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
diff --git a/src/server/models.py b/src/server/models.py
index 533da611..97739416 100644
--- a/src/server/models.py
+++ b/src/server/models.py
@@ -116,6 +116,25 @@ class IngestErrorResponse(BaseModel):
 IngestResponse = Union[IngestSuccessResponse, IngestErrorResponse]
 
 
+class S3Metadata(BaseModel):
+    """Model for S3 metadata structure.
+
+    Attributes
+    ----------
+    summary : str
+        Summary of the ingestion process including token estimates.
+    tree : str
+        File tree structure of the repository.
+    content : str
+        Processed content from the repository files.
+
+    """
+
+    summary: str = Field(..., description="Ingestion summary with token estimates")
+    tree: str = Field(..., description="File tree structure")
+    content: str = Field(..., description="Processed file content")
+
+
 class QueryForm(BaseModel):
     """Form data for the query.
diff --git a/src/server/query_processor.py b/src/server/query_processor.py
index 172330ac..7a55bfd1 100644
--- a/src/server/query_processor.py
+++ b/src/server/query_processor.py
@@ -2,19 +2,211 @@
 
 from __future__ import annotations
 
+import logging
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast
 
 from gitingest.clone import clone_repo
 from gitingest.ingestion import ingest_query
 from gitingest.query_parser import parse_remote_repo
-from gitingest.utils.git_utils import validate_github_token
+from gitingest.utils.git_utils import resolve_commit, validate_github_token
 from gitingest.utils.pattern_utils import process_patterns
-from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
-from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
+from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata
+from server.s3_utils import (
+    _build_s3_url,
+    check_s3_object_exists,
+    generate_s3_file_path,
+    get_metadata_from_s3,
+    is_s3_enabled,
+    upload_metadata_to_s3,
+    upload_to_s3,
+)
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import Colors
 
+if TYPE_CHECKING:
+    from gitingest.schemas.cloning import CloneConfig
+    from gitingest.schemas.ingestion import IngestionQuery
+
+logger = logging.getLogger(__name__)
+
+
+async def _check_s3_cache(
+    query: IngestionQuery,
+    input_text: str,
+    max_file_size: int,
+    pattern_type: str,
+    pattern: str,
+    token: str | None,
+) -> IngestSuccessResponse | None:
+    """Check if digest already exists on S3 and return response if found.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The parsed query object.
+    input_text : str
+        Original input text.
+    max_file_size : int
+        Maximum file size in KB.
+    pattern_type : str
+        Pattern type (include/exclude).
+    pattern : str
+        Pattern string.
+    token : str | None
+        GitHub token.
+
+    Returns
+    -------
+    IngestSuccessResponse | None
+        Response if file exists on S3, None otherwise.
+ + """ + if not is_s3_enabled(): + return None + + try: + # Use git ls-remote to get commit SHA without cloning + clone_config = query.extract_clone_config() + query.commit = await resolve_commit(clone_config, token=token) + # Generate S3 file path using the resolved commit + s3_file_path = generate_s3_file_path( + source=query.url, + user_name=cast("str", query.user_name), + repo_name=cast("str", query.repo_name), + commit=query.commit, + include_patterns=query.include_patterns, + ignore_patterns=query.ignore_patterns, + ) + + # Check if file exists on S3 + if check_s3_object_exists(s3_file_path): + # File exists on S3, serve it directly without cloning + s3_url = _build_s3_url(s3_file_path) + query.s3_url = s3_url + + short_repo_url = f"{query.user_name}/{query.repo_name}" + + # Try to get cached metadata + metadata = get_metadata_from_s3(s3_file_path) + + if metadata: + # Use cached metadata if available + summary = metadata.summary + tree = metadata.tree + content = metadata.content + else: + # Fallback to placeholder messages if metadata not available + summary = "Digest served from cache (S3). Download the full digest to see content details." + tree = "Digest served from cache. Download the full digest to see the file tree." + content = "Digest served from cache. Download the full digest to see the content." + + return IngestSuccessResponse( + repo_url=input_text, + short_repo_url=short_repo_url, + summary=summary, + digest_url=s3_url, + tree=tree, + content=content, + default_max_file_size=max_file_size, + pattern_type=pattern_type, + pattern=pattern, + ) + except Exception as exc: + # Log the exception but don't fail the entire request + logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc) + + return None + + +def _store_digest_content( + query: IngestionQuery, + clone_config: CloneConfig, + digest_content: str, + summary: str, + tree: str, + content: str, +) -> None: + """Store digest content either to S3 or locally based on configuration. + + Parameters + ---------- + query : IngestionQuery + The query object containing repository information. + clone_config : CloneConfig + The clone configuration object. + digest_content : str + The complete digest content to store. + summary : str + The summary content for metadata. + tree : str + The tree content for metadata. + content : str + The file content for metadata. 
+ + """ + if is_s3_enabled(): + # Upload to S3 instead of storing locally + s3_file_path = generate_s3_file_path( + source=query.url, + user_name=cast("str", query.user_name), + repo_name=cast("str", query.repo_name), + commit=query.commit, + include_patterns=query.include_patterns, + ignore_patterns=query.ignore_patterns, + ) + s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id) + + # Also upload metadata JSON for caching + metadata = S3Metadata( + summary=summary, + tree=tree, + content=content, + ) + try: + upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id) + logger.debug("Successfully uploaded metadata to S3") + except Exception as metadata_exc: + # Log the error but don't fail the entire request + logger.warning("Failed to upload metadata to S3: %s", metadata_exc) + + # Store S3 URL in query for later use + query.s3_url = s3_url + else: + # Store locally + local_txt_file = Path(clone_config.local_path).with_suffix(".txt") + with local_txt_file.open("w", encoding="utf-8") as f: + f.write(digest_content) + + +def _generate_digest_url(query: IngestionQuery) -> str: + """Generate the digest URL based on S3 configuration. + + Parameters + ---------- + query : IngestionQuery + The query object containing repository information. + + Returns + ------- + str + The digest URL. + + Raises + ------ + RuntimeError + If S3 is enabled but no S3 URL was generated. + + """ + if is_s3_enabled(): + digest_url = getattr(query, "s3_url", None) + if not digest_url: + # This should not happen if S3 upload was successful + msg = "S3 is enabled but no S3 URL was generated" + raise RuntimeError(msg) + return digest_url + return f"/api/download/file/{query.id}" + async def process_query( input_text: str, @@ -69,10 +261,22 @@ async def process_query( include_patterns=pattern if pattern_type == PatternType.INCLUDE else None, ) + # Check if digest already exists on S3 before cloning + s3_response = await _check_s3_cache( + query=query, + input_text=input_text, + max_file_size=max_file_size, + pattern_type=pattern_type.value, + pattern=pattern, + token=token, + ) + if s3_response: + return s3_response + clone_config = query.extract_clone_config() await clone_repo(clone_config, token=token) - short_repo_url = f"{query.user_name}/{query.repo_name}" # Sets the "/" for the page title + short_repo_url = f"{query.user_name}/{query.repo_name}" # The commit hash should always be available at this point if not query.commit: @@ -81,30 +285,8 @@ async def process_query( try: summary, tree, content = ingest_query(query) - - # Prepare the digest content (tree + content) digest_content = tree + "\n" + content - - # Store digest based on S3 configuration - if is_s3_enabled(): - # Upload to S3 instead of storing locally - s3_file_path = generate_s3_file_path( - source=query.url, - user_name=cast("str", query.user_name), - repo_name=cast("str", query.repo_name), - commit=query.commit, - include_patterns=query.include_patterns, - ignore_patterns=query.ignore_patterns, - ) - s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id) - # Store S3 URL in query for later use - query.s3_url = s3_url - else: - # Store locally - local_txt_file = Path(clone_config.local_path).with_suffix(".txt") - with local_txt_file.open("w", encoding="utf-8") as f: - f.write(digest_content) - + _store_digest_content(query, clone_config, digest_content, summary, tree, content) except Exception as exc: _print_error(query.url, exc, max_file_size, 
pattern_type, pattern) return IngestErrorResponse(error=str(exc)) @@ -123,15 +305,7 @@ async def process_query( summary=summary, ) - # Generate digest_url based on S3 configuration - if is_s3_enabled(): - digest_url = getattr(query, "s3_url", None) - if not digest_url: - # This should not happen if S3 upload was successful - msg = "S3 is enabled but no S3 URL was generated" - raise RuntimeError(msg) - else: - digest_url = f"/api/download/file/{query.id}" + digest_url = _generate_digest_url(query) return IngestSuccessResponse( repo_url=input_text, diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py index a30a957f..f7f14ad0 100644 --- a/src/server/s3_utils.py +++ b/src/server/s3_utils.py @@ -11,13 +11,21 @@ import boto3 from botocore.exceptions import ClientError +from prometheus_client import Counter + +from server.models import S3Metadata if TYPE_CHECKING: from botocore.client import BaseClient + # Initialize logger for this module logger = logging.getLogger(__name__) +_cache_lookup_counter = Counter("gitingest_cache_lookup", "Number of cache lookups", ["url"]) +_cache_hit_counter = Counter("gitingest_cache_hit", "Number of cache hits", ["url"]) +_cache_miss_counter = Counter("gitingest_cache_miss", "Number of cache misses", ["url"]) + class S3UploadError(Exception): """Custom exception for S3 upload failures.""" @@ -231,6 +239,149 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: return public_url +def upload_metadata_to_s3(metadata: S3Metadata, s3_file_path: str, ingest_id: UUID) -> str: + """Upload metadata JSON to S3 alongside the digest file. + + Parameters + ---------- + metadata : S3Metadata + The metadata struct containing summary, tree, and content. + s3_file_path : str + The S3 file path for the digest (metadata will use .json extension). + ingest_id : UUID + The ingest ID to store as an S3 object tag. + + Returns + ------- + str + Public URL to access the uploaded metadata file. + + Raises + ------ + ValueError + If S3 is not enabled. + S3UploadError + If the upload to S3 fails. 
+ + """ + if not is_s3_enabled(): + msg = "S3 is not enabled" + logger.error(msg) + raise ValueError(msg) + + # Generate metadata file path by replacing .txt with .json + metadata_file_path = s3_file_path.replace(".txt", ".json") + + s3_client = create_s3_client() + bucket_name = get_s3_bucket_name() + + extra_fields = { + "bucket_name": bucket_name, + "metadata_file_path": metadata_file_path, + "ingest_id": str(ingest_id), + "metadata_size": len(metadata.model_dump_json()), + } + + # Log upload attempt + logger.debug("Starting S3 metadata upload", extra=extra_fields) + + try: + # Upload the metadata with ingest_id as tag + s3_client.put_object( + Bucket=bucket_name, + Key=metadata_file_path, + Body=metadata.model_dump_json(indent=2).encode("utf-8"), + ContentType="application/json", + Tagging=f"ingest_id={ingest_id!s}", + ) + except ClientError as err: + # Log upload failure + logger.exception( + "S3 metadata upload failed", + extra={ + "bucket_name": bucket_name, + "metadata_file_path": metadata_file_path, + "ingest_id": str(ingest_id), + "error_code": err.response.get("Error", {}).get("Code"), + "error_message": str(err), + }, + ) + msg = f"Failed to upload metadata to S3: {err}" + raise S3UploadError(msg) from err + + # Generate public URL + alias_host = get_s3_alias_host() + if alias_host: + # Use alias host if configured + public_url = f"{alias_host.rstrip('/')}/{metadata_file_path}" + else: + # Fallback to direct S3 URL + endpoint = get_s3_config().get("endpoint_url") + if endpoint: + public_url = f"{endpoint.rstrip('/')}/{bucket_name}/{metadata_file_path}" + else: + public_url = ( + f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{metadata_file_path}" + ) + + # Log successful upload + logger.debug( + "S3 metadata upload completed successfully", + extra={ + "bucket_name": bucket_name, + "metadata_file_path": metadata_file_path, + "ingest_id": str(ingest_id), + "public_url": public_url, + }, + ) + + return public_url + + +def get_metadata_from_s3(s3_file_path: str) -> S3Metadata | None: + """Retrieve metadata JSON from S3. + + Parameters + ---------- + s3_file_path : str + The S3 file path for the digest (metadata will use .json extension). + + Returns + ------- + S3Metadata | None + The metadata struct if found, None otherwise. 
+ + """ + if not is_s3_enabled(): + return None + + # Generate metadata file path by replacing .txt with .json + metadata_file_path = s3_file_path.replace(".txt", ".json") + + try: + s3_client = create_s3_client() + bucket_name = get_s3_bucket_name() + + # Get the metadata object + response = s3_client.get_object(Bucket=bucket_name, Key=metadata_file_path) + metadata_content = response["Body"].read().decode("utf-8") + + return S3Metadata.model_validate_json(metadata_content) + except ClientError as err: + # Object doesn't exist if we get a 404 error + error_code = err.response.get("Error", {}).get("Code") + if error_code == "404": + logger.debug("Metadata file not found: %s", metadata_file_path) + return None + # Log other errors but don't fail + logger.warning("Failed to retrieve metadata from S3: %s", err) + return None + except Exception as exc: + # For any other exception, log and return None + logger.warning("Unexpected error retrieving metadata from S3: %s", exc) + return None + + def _build_s3_url(key: str) -> str: """Build S3 URL for a given key.""" alias_host = get_s3_alias_host() @@ -257,6 +408,51 @@ def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target return False +def check_s3_object_exists(s3_file_path: str) -> bool: + """Check if an S3 object exists at the given path. + + Parameters + ---------- + s3_file_path : str + The S3 file path to check. + + Returns + ------- + bool + True if the object exists, False otherwise. + + Raises + ------ + ClientError + If there's an S3 error other than 404 (not found). + + """ + if not is_s3_enabled(): + return False + _cache_lookup_counter.labels(url=s3_file_path).inc() + try: + s3_client = create_s3_client() + bucket_name = get_s3_bucket_name() + + # Use head_object to check if the object exists without downloading it + s3_client.head_object(Bucket=bucket_name, Key=s3_file_path) + except ClientError as err: + # Object doesn't exist if we get a 404 error + error_code = err.response.get("Error", {}).get("Code") + if error_code == "404": + _cache_miss_counter.labels(url=s3_file_path).inc() + return False + # Re-raise other errors (permissions, etc.) + raise + except Exception: + # For any other exception, assume object doesn't exist + _cache_miss_counter.labels(url=s3_file_path).inc() + return False + else: + _cache_hit_counter.labels(url=s3_file_path).inc() + return True + + def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: """Get S3 URL for a given ingest ID if it exists.