From 579894d36aad22231f914c96d3de81df81748715 Mon Sep 17 00:00:00 2001 From: mickael Date: Mon, 28 Jul 2025 17:46:38 +0200 Subject: [PATCH 1/5] feat: serve cached digest if available --- src/server/query_processor.py | 254 +++++++++++++++++++++++++++++----- src/server/s3_utils.py | 186 +++++++++++++++++++++++++ 2 files changed, 404 insertions(+), 36 deletions(-) diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 172330ac..476dd9bf 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -2,19 +2,219 @@ from __future__ import annotations +import logging from pathlib import Path -from typing import cast +from typing import TYPE_CHECKING, cast from gitingest.clone import clone_repo from gitingest.ingestion import ingest_query from gitingest.query_parser import parse_remote_repo -from gitingest.utils.git_utils import validate_github_token +from gitingest.utils.git_utils import resolve_commit, validate_github_token from gitingest.utils.pattern_utils import process_patterns from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType -from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3 +from server.s3_utils import ( + _build_s3_url, + check_s3_object_exists, + generate_s3_file_path, + get_metadata_from_s3, + is_s3_enabled, + upload_metadata_to_s3, + upload_to_s3, +) from server.server_config import MAX_DISPLAY_SIZE from server.server_utils import Colors +if TYPE_CHECKING: + from gitingest.schemas.cloning import CloneConfig + from gitingest.schemas.ingestion import IngestionQuery + +logger = logging.getLogger(__name__) + + +async def _check_s3_cache( + query: IngestionQuery, + input_text: str, + max_file_size: int, + pattern_type: str, + pattern: str, + token: str | None, +) -> IngestSuccessResponse | None: + """Check if digest already exists on S3 and return response if found. + + Parameters + ---------- + query : IngestionQuery + The parsed query object. + input_text : str + Original input text. + max_file_size : int + Maximum file size in KB. + pattern_type : str + Pattern type (include/exclude). + pattern : str + Pattern string. + token : str | None + GitHub token. + + Returns + ------- + IngestSuccessResponse | None + Response if file exists on S3, None otherwise. + + """ + if not is_s3_enabled(): + return None + + try: + # Use git ls-remote to get commit SHA without cloning + clone_config = query.extract_clone_config() + commit_sha = await resolve_commit(clone_config, token=token) + query.commit = commit_sha + + # Generate S3 file path using the resolved commit + s3_file_path = generate_s3_file_path( + source=query.url, + user_name=cast("str", query.user_name), + repo_name=cast("str", query.repo_name), + commit=commit_sha, + include_patterns=query.include_patterns, + ignore_patterns=query.ignore_patterns, + ) + + # Check if file exists on S3 + if check_s3_object_exists(s3_file_path): + # File exists on S3, serve it directly without cloning + s3_url = _build_s3_url(s3_file_path) + query.s3_url = s3_url + + short_repo_url = f"{query.user_name}/{query.repo_name}" + + # Try to get cached metadata + metadata = get_metadata_from_s3(s3_file_path) + + if metadata: + # Use cached metadata if available + summary = metadata.get( + "summary", + "Digest served from cache (S3). Download the full digest to see content details.", + ) + tree = metadata.get("tree", "Digest served from cache. 
Download the full digest to see the file tree.") + content = metadata.get( + "content", + "Digest served from cache. Download the full digest to see the content.", + ) + else: + # Fallback to placeholder messages if metadata not available + summary = "Digest served from cache (S3). Download the full digest to see content details." + tree = "Digest served from cache. Download the full digest to see the file tree." + content = "Digest served from cache. Download the full digest to see the content." + + return IngestSuccessResponse( + repo_url=input_text, + short_repo_url=short_repo_url, + summary=summary, + digest_url=s3_url, + tree=tree, + content=content, + default_max_file_size=max_file_size, + pattern_type=pattern_type, + pattern=pattern, + ) + except Exception as exc: + # Log the exception but don't fail the entire request + logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc) + + return None + + +def _store_digest_content( + query: IngestionQuery, + clone_config: CloneConfig, + digest_content: str, + summary: str, + tree: str, + content: str, +) -> None: + """Store digest content either to S3 or locally based on configuration. + + Parameters + ---------- + query : IngestionQuery + The query object containing repository information. + clone_config : CloneConfig + The clone configuration object. + digest_content : str + The complete digest content to store. + summary : str + The summary content for metadata. + tree : str + The tree content for metadata. + content : str + The file content for metadata. + + """ + if is_s3_enabled(): + # Upload to S3 instead of storing locally + s3_file_path = generate_s3_file_path( + source=query.url, + user_name=cast("str", query.user_name), + repo_name=cast("str", query.repo_name), + commit=query.commit, + include_patterns=query.include_patterns, + ignore_patterns=query.ignore_patterns, + ) + s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id) + + # Also upload metadata JSON for caching + metadata = { + "summary": summary, + "tree": tree, + "content": content, + } + try: + upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id) + logger.debug("Successfully uploaded metadata to S3") + except Exception as metadata_exc: + # Log the error but don't fail the entire request + logger.warning("Failed to upload metadata to S3: %s", metadata_exc) + + # Store S3 URL in query for later use + query.s3_url = s3_url + else: + # Store locally + local_txt_file = Path(clone_config.local_path).with_suffix(".txt") + with local_txt_file.open("w", encoding="utf-8") as f: + f.write(digest_content) + + +def _generate_digest_url(query: IngestionQuery) -> str: + """Generate the digest URL based on S3 configuration. + + Parameters + ---------- + query : IngestionQuery + The query object containing repository information. + + Returns + ------- + str + The digest URL. + + Raises + ------ + RuntimeError + If S3 is enabled but no S3 URL was generated. 
+ + """ + if is_s3_enabled(): + digest_url = getattr(query, "s3_url", None) + if not digest_url: + # This should not happen if S3 upload was successful + msg = "S3 is enabled but no S3 URL was generated" + raise RuntimeError(msg) + return digest_url + return f"/api/download/file/{query.id}" + async def process_query( input_text: str, @@ -69,10 +269,22 @@ async def process_query( include_patterns=pattern if pattern_type == PatternType.INCLUDE else None, ) + # Check if digest already exists on S3 before cloning + s3_response = await _check_s3_cache( + query=query, + input_text=input_text, + max_file_size=max_file_size, + pattern_type=pattern_type.value, + pattern=pattern, + token=token, + ) + if s3_response: + return s3_response + clone_config = query.extract_clone_config() await clone_repo(clone_config, token=token) - short_repo_url = f"{query.user_name}/{query.repo_name}" # Sets the "/" for the page title + short_repo_url = f"{query.user_name}/{query.repo_name}" # The commit hash should always be available at this point if not query.commit: @@ -81,30 +293,8 @@ async def process_query( try: summary, tree, content = ingest_query(query) - - # Prepare the digest content (tree + content) digest_content = tree + "\n" + content - - # Store digest based on S3 configuration - if is_s3_enabled(): - # Upload to S3 instead of storing locally - s3_file_path = generate_s3_file_path( - source=query.url, - user_name=cast("str", query.user_name), - repo_name=cast("str", query.repo_name), - commit=query.commit, - include_patterns=query.include_patterns, - ignore_patterns=query.ignore_patterns, - ) - s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id) - # Store S3 URL in query for later use - query.s3_url = s3_url - else: - # Store locally - local_txt_file = Path(clone_config.local_path).with_suffix(".txt") - with local_txt_file.open("w", encoding="utf-8") as f: - f.write(digest_content) - + _store_digest_content(query, clone_config, digest_content, summary, tree, content) except Exception as exc: _print_error(query.url, exc, max_file_size, pattern_type, pattern) return IngestErrorResponse(error=str(exc)) @@ -123,15 +313,7 @@ async def process_query( summary=summary, ) - # Generate digest_url based on S3 configuration - if is_s3_enabled(): - digest_url = getattr(query, "s3_url", None) - if not digest_url: - # This should not happen if S3 upload was successful - msg = "S3 is enabled but no S3 URL was generated" - raise RuntimeError(msg) - else: - digest_url = f"/api/download/file/{query.id}" + digest_url = _generate_digest_url(query) return IngestSuccessResponse( repo_url=input_text, diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py index a30a957f..26fb32e0 100644 --- a/src/server/s3_utils.py +++ b/src/server/s3_utils.py @@ -3,6 +3,7 @@ from __future__ import annotations import hashlib +import json import logging import os from typing import TYPE_CHECKING @@ -231,6 +232,149 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: return public_url +def upload_metadata_to_s3(metadata: dict, s3_file_path: str, ingest_id: UUID) -> str: + """Upload metadata JSON to S3 alongside the digest file. + + Parameters + ---------- + metadata : dict + The metadata dictionary containing summary, tree, and content. + s3_file_path : str + The S3 file path for the digest (metadata will use .json extension). + ingest_id : UUID + The ingest ID to store as an S3 object tag. + + Returns + ------- + str + Public URL to access the uploaded metadata file. 
+ + Raises + ------ + ValueError + If S3 is not enabled. + S3UploadError + If the upload to S3 fails. + + """ + if not is_s3_enabled(): + msg = "S3 is not enabled" + logger.error(msg) + raise ValueError(msg) + + # Generate metadata file path by replacing .txt with .json + metadata_file_path = s3_file_path.replace(".txt", ".json") + + s3_client = create_s3_client() + bucket_name = get_s3_bucket_name() + + extra_fields = { + "bucket_name": bucket_name, + "metadata_file_path": metadata_file_path, + "ingest_id": str(ingest_id), + "metadata_size": len(json.dumps(metadata)), + } + + # Log upload attempt + logger.debug("Starting S3 metadata upload", extra=extra_fields) + + try: + # Upload the metadata with ingest_id as tag + s3_client.put_object( + Bucket=bucket_name, + Key=metadata_file_path, + Body=json.dumps(metadata, indent=2).encode("utf-8"), + ContentType="application/json", + Tagging=f"ingest_id={ingest_id!s}", + ) + except ClientError as err: + # Log upload failure + logger.exception( + "S3 metadata upload failed", + extra={ + "bucket_name": bucket_name, + "metadata_file_path": metadata_file_path, + "ingest_id": str(ingest_id), + "error_code": err.response.get("Error", {}).get("Code"), + "error_message": str(err), + }, + ) + msg = f"Failed to upload metadata to S3: {err}" + raise S3UploadError(msg) from err + + # Generate public URL + alias_host = get_s3_alias_host() + if alias_host: + # Use alias host if configured + public_url = f"{alias_host.rstrip('/')}/{metadata_file_path}" + else: + # Fallback to direct S3 URL + endpoint = get_s3_config().get("endpoint_url") + if endpoint: + public_url = f"{endpoint.rstrip('/')}/{bucket_name}/{metadata_file_path}" + else: + public_url = ( + f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{metadata_file_path}" + ) + + # Log successful upload + logger.debug( + "S3 metadata upload completed successfully", + extra={ + "bucket_name": bucket_name, + "metadata_file_path": metadata_file_path, + "ingest_id": str(ingest_id), + "public_url": public_url, + }, + ) + + return public_url + + +def get_metadata_from_s3(s3_file_path: str) -> dict | None: + """Retrieve metadata JSON from S3. + + Parameters + ---------- + s3_file_path : str + The S3 file path for the digest (metadata will use .json extension). + + Returns + ------- + dict | None + The metadata dictionary if found, None otherwise. 
+ + """ + if not is_s3_enabled(): + return None + + # Generate metadata file path by replacing .txt with .json + metadata_file_path = s3_file_path.replace(".txt", ".json") + + try: + s3_client = create_s3_client() + bucket_name = get_s3_bucket_name() + + # Get the metadata object + response = s3_client.get_object(Bucket=bucket_name, Key=metadata_file_path) + metadata_content = response["Body"].read().decode("utf-8") + + return json.loads(metadata_content) + except ClientError as err: + # Object doesn't exist if we get a 404 error + error_code = err.response.get("Error", {}).get("Code") + if error_code == "404": + logger.debug("Metadata file not found: %s", metadata_file_path) + return None + # Log other errors but don't fail + logger.warning("Failed to retrieve metadata from S3: %s", err) + return None + except Exception as exc: + # For any other exception, log and return None + logger.warning("Unexpected error retrieving metadata from S3: %s", exc) + return None + + def _build_s3_url(key: str) -> str: """Build S3 URL for a given key.""" alias_host = get_s3_alias_host() @@ -257,6 +401,48 @@ def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target return False +def check_s3_object_exists(s3_file_path: str) -> bool: + """Check if an S3 object exists at the given path. + + Parameters + ---------- + s3_file_path : str + The S3 file path to check. + + Returns + ------- + bool + True if the object exists, False otherwise. + + Raises + ------ + ClientError + If there's an S3 error other than 404 (not found). + + """ + if not is_s3_enabled(): + return False + + try: + s3_client = create_s3_client() + bucket_name = get_s3_bucket_name() + + # Use head_object to check if the object exists without downloading it + s3_client.head_object(Bucket=bucket_name, Key=s3_file_path) + except ClientError as err: + # Object doesn't exist if we get a 404 error + error_code = err.response.get("Error", {}).get("Code") + if error_code == "404": + return False + # Re-raise other errors (permissions, etc.) + raise + except Exception: + # For any other exception, assume object doesn't exist + return False + else: + return True + + def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: """Get S3 URL for a given ingest ID if it exists. From 9af0dcc180ba93505338f0e9183eb60ec7b8e71e Mon Sep 17 00:00:00 2001 From: mickael Date: Mon, 28 Jul 2025 19:23:13 +0200 Subject: [PATCH 2/5] feat: add S3Metadata model for structured metadata handling --- src/server/models.py | 19 +++++++++++++++++++ src/server/query_processor.py | 24 +++++++++--------------- src/server/s3_utils.py | 22 ++++++++++++---------- 3 files changed, 40 insertions(+), 25 deletions(-) diff --git a/src/server/models.py b/src/server/models.py index 533da611..97739416 100644 --- a/src/server/models.py +++ b/src/server/models.py @@ -116,6 +116,25 @@ class IngestErrorResponse(BaseModel): IngestResponse = Union[IngestSuccessResponse, IngestErrorResponse] +class S3Metadata(BaseModel): + """Model for S3 metadata structure. + + Attributes + ---------- + summary : str + Summary of the ingestion process including token estimates. + tree : str + File tree structure of the repository. + content : str + Processed content from the repository files. + + """ + + summary: str = Field(..., description="Ingestion summary with token estimates") + tree: str = Field(..., description="File tree structure") + content: str = Field(..., description="Processed file content") + + class QueryForm(BaseModel): """Form data for the query. 
diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 476dd9bf..12924d2f 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -11,7 +11,7 @@ from gitingest.query_parser import parse_remote_repo from gitingest.utils.git_utils import resolve_commit, validate_github_token from gitingest.utils.pattern_utils import process_patterns -from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType +from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata from server.s3_utils import ( _build_s3_url, check_s3_object_exists, @@ -94,15 +94,9 @@ async def _check_s3_cache( if metadata: # Use cached metadata if available - summary = metadata.get( - "summary", - "Digest served from cache (S3). Download the full digest to see content details.", - ) - tree = metadata.get("tree", "Digest served from cache. Download the full digest to see the file tree.") - content = metadata.get( - "content", - "Digest served from cache. Download the full digest to see the content.", - ) + summary = metadata.summary + tree = metadata.tree + content = metadata.content else: # Fallback to placeholder messages if metadata not available summary = "Digest served from cache (S3). Download the full digest to see content details." @@ -166,11 +160,11 @@ def _store_digest_content( s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id) # Also upload metadata JSON for caching - metadata = { - "summary": summary, - "tree": tree, - "content": content, - } + metadata = S3Metadata( + summary=summary, + tree=tree, + content=content, + ) try: upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id) logger.debug("Successfully uploaded metadata to S3") diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py index 26fb32e0..e1d38cfa 100644 --- a/src/server/s3_utils.py +++ b/src/server/s3_utils.py @@ -3,7 +3,6 @@ from __future__ import annotations import hashlib -import json import logging import os from typing import TYPE_CHECKING @@ -13,9 +12,12 @@ import boto3 from botocore.exceptions import ClientError +from server.models import S3Metadata + if TYPE_CHECKING: from botocore.client import BaseClient + # Initialize logger for this module logger = logging.getLogger(__name__) @@ -232,13 +234,13 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: return public_url -def upload_metadata_to_s3(metadata: dict, s3_file_path: str, ingest_id: UUID) -> str: +def upload_metadata_to_s3(metadata: S3Metadata, s3_file_path: str, ingest_id: UUID) -> str: """Upload metadata JSON to S3 alongside the digest file. Parameters ---------- - metadata : dict - The metadata dictionary containing summary, tree, and content. + metadata : S3Metadata + The metadata struct containing summary, tree, and content. s3_file_path : str The S3 file path for the digest (metadata will use .json extension). 
ingest_id : UUID @@ -272,7 +274,7 @@ def upload_metadata_to_s3(metadata: dict, s3_file_path: str, ingest_id: UUID) -> "bucket_name": bucket_name, "metadata_file_path": metadata_file_path, "ingest_id": str(ingest_id), - "metadata_size": len(json.dumps(metadata)), + "metadata_size": len(metadata.model_dump_json()), } # Log upload attempt @@ -283,7 +285,7 @@ def upload_metadata_to_s3(metadata: dict, s3_file_path: str, ingest_id: UUID) -> s3_client.put_object( Bucket=bucket_name, Key=metadata_file_path, - Body=json.dumps(metadata, indent=2).encode("utf-8"), + Body=metadata.model_dump_json(indent=2).encode("utf-8"), ContentType="application/json", Tagging=f"ingest_id={ingest_id!s}", ) @@ -331,7 +333,7 @@ def upload_metadata_to_s3(metadata: dict, s3_file_path: str, ingest_id: UUID) -> return public_url -def get_metadata_from_s3(s3_file_path: str) -> dict | None: +def get_metadata_from_s3(s3_file_path: str) -> S3Metadata | None: """Retrieve metadata JSON from S3. Parameters @@ -341,8 +343,8 @@ def get_metadata_from_s3(s3_file_path: str) -> dict | None: Returns ------- - dict | None - The metadata dictionary if found, None otherwise. + S3Metadata | None + The metadata struct if found, None otherwise. """ if not is_s3_enabled(): @@ -359,7 +361,7 @@ def get_metadata_from_s3(s3_file_path: str) -> dict | None: response = s3_client.get_object(Bucket=bucket_name, Key=metadata_file_path) metadata_content = response["Body"].read().decode("utf-8") - return json.loads(metadata_content) + return S3Metadata.model_validate_json(metadata_content) except ClientError as err: # Object doesn't exist if we get a 404 error error_code = err.response.get("Error", {}).get("Code") From 88f0754e2c861121f74648965487f85120307f1d Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Mon, 28 Jul 2025 22:43:37 +0200 Subject: [PATCH 3/5] feat: add prometheus metrics for s3 cache --- src/server/s3_utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py index e1d38cfa..f7f14ad0 100644 --- a/src/server/s3_utils.py +++ b/src/server/s3_utils.py @@ -11,6 +11,7 @@ import boto3 from botocore.exceptions import ClientError +from prometheus_client import Counter from server.models import S3Metadata @@ -21,6 +22,10 @@ # Initialize logger for this module logger = logging.getLogger(__name__) +_cache_lookup_counter = Counter("gitingest_cache_lookup", "Number of cache lookups", ["url"]) +_cache_hit_counter = Counter("gitingest_cache_hit", "Number of cache hits", ["url"]) +_cache_miss_counter = Counter("gitingest_cache_miss", "Number of cache misses", ["url"]) + class S3UploadError(Exception): """Custom exception for S3 upload failures.""" @@ -424,7 +429,7 @@ def check_s3_object_exists(s3_file_path: str) -> bool: """ if not is_s3_enabled(): return False - + _cache_lookup_counter.labels(url=s3_file_path).inc() try: s3_client = create_s3_client() bucket_name = get_s3_bucket_name() @@ -435,13 +440,16 @@ def check_s3_object_exists(s3_file_path: str) -> bool: # Object doesn't exist if we get a 404 error error_code = err.response.get("Error", {}).get("Code") if error_code == "404": + _cache_miss_counter.labels(url=s3_file_path).inc() return False # Re-raise other errors (permissions, etc.) 
raise except Exception: # For any other exception, assume object doesn't exist + _cache_miss_counter.labels(url=s3_file_path).inc() return False else: + _cache_hit_counter.labels(url=s3_file_path).inc() return True From 34fe40a608dd9fd8d75b5053850fde9773bd41a9 Mon Sep 17 00:00:00 2001 From: mickael Date: Tue, 29 Jul 2025 16:45:17 +0200 Subject: [PATCH 4/5] refactor: simplify query commit resolution Combine steps to resolve and assign commit in query_processor. --- src/server/query_processor.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 12924d2f..7a55bfd1 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -68,15 +68,13 @@ async def _check_s3_cache( try: # Use git ls-remote to get commit SHA without cloning clone_config = query.extract_clone_config() - commit_sha = await resolve_commit(clone_config, token=token) - query.commit = commit_sha - + query.commit = await resolve_commit(clone_config, token=token) # Generate S3 file path using the resolved commit s3_file_path = generate_s3_file_path( source=query.url, user_name=cast("str", query.user_name), repo_name=cast("str", query.repo_name), - commit=commit_sha, + commit=query.commit, include_patterns=query.include_patterns, ignore_patterns=query.ignore_patterns, ) From 48fc139bb850b8f3c243908a2e372620f79567cf Mon Sep 17 00:00:00 2001 From: mickael Date: Tue, 29 Jul 2025 16:57:52 +0200 Subject: [PATCH 5/5] ci: disable fail-fast in matrix strategy --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 17e6628a..ed498934 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: test: runs-on: ${{ matrix.os }} strategy: - fail-fast: true + fail-fast: false matrix: os: [ubuntu-latest, macos-latest, windows-latest] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
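
A quick way to sanity-check the metadata round-trip introduced in patches 1-3 (upload_metadata_to_s3 / get_metadata_from_s3 / check_s3_object_exists) is a small moto-backed test. The sketch below is illustrative only and is not part of the series: the test module, bucket name, and object key are made up, the monkeypatched config helpers (is_s3_enabled, get_s3_bucket_name, create_s3_client, get_s3_alias_host, get_s3_config) are assumptions about how the surrounding test setup would isolate S3 configuration, and it presumes moto >= 5 and pydantic v2 model equality. In the real flow the key would come from generate_s3_file_path; a ".txt" suffix is assumed because the patch derives the metadata key via .replace(".txt", ".json").

# tests/server/test_s3_metadata_roundtrip.py  (hypothetical location)
import uuid

import boto3
import pytest
from moto import mock_aws

from server import s3_utils
from server.models import S3Metadata


@mock_aws
def test_metadata_round_trip(monkeypatch: pytest.MonkeyPatch) -> None:
    bucket = "gitingest-test"  # assumed bucket name for the mocked backend
    boto3.client("s3", region_name="us-east-1").create_bucket(Bucket=bucket)

    # Force the module to treat S3 as enabled and point it at the moto-mocked bucket,
    # so no real environment variables or credentials are needed.
    monkeypatch.setattr(s3_utils, "is_s3_enabled", lambda: True)
    monkeypatch.setattr(s3_utils, "get_s3_bucket_name", lambda: bucket)
    monkeypatch.setattr(s3_utils, "create_s3_client", lambda: boto3.client("s3", region_name="us-east-1"))
    monkeypatch.setattr(s3_utils, "get_s3_alias_host", lambda: None)
    monkeypatch.setattr(s3_utils, "get_s3_config", lambda: {"region_name": "us-east-1", "endpoint_url": None})

    # Hypothetical digest key; in production this is produced by generate_s3_file_path().
    key = "ingest/user/repo/abc123/digest.txt"
    meta = S3Metadata(summary="s", tree="t", content="c")

    # Upload writes the sidecar .json next to the digest key and tags it with the ingest id.
    s3_utils.upload_metadata_to_s3(metadata=meta, s3_file_path=key, ingest_id=uuid.uuid4())

    # Reading back through the same digest key should reconstruct the pydantic model.
    assert s3_utils.get_metadata_from_s3(key) == meta

    # Only the .json sidecar was uploaded, so the digest itself still reads as a cache miss
    # (this path also exercises the lookup/miss Prometheus counters from patch 3).
    assert not s3_utils.check_s3_object_exists(key)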