GeneralUserModels · valteu · Nov 19, 2025 · Nov 19, 2025 · Nov 25, 2025 · Dec 2, 2025
diff --git a/.gitignore b/.gitignore
@@ -11,6 +11,7 @@ prompting_benchmark/data/*
 !prompting_benchmark/data/screenshots/
 
 datasets
+sample_images
 
 # C extensions
 *.so

diff --git a/pyproject.toml b/pyproject.toml
@@ -26,7 +26,7 @@ dependencies = [
     "screeninfo>=0.8.1",
     "pynput>=1.8.1",
     "scikit-image>=0.25.2",
-    "google-genai>=1.45.0",
+    "google-genai>=1.52.0",
 ]
 
 [tool.uv]

diff --git a/src/label/__main__.py b/src/label/__main__.py
@@ -2,7 +2,7 @@
 import argparse
 from dotenv import load_dotenv
 
-from label.discovery import discover_sessions, discover_video_sessions, create_single_config
+from label.discovery import discover_sessions, discover_screenshots_sessions, create_single_config
 from label.clients import create_client
 from label.processor import Processor
 from label.visualizer import Visualizer
@@ -20,9 +20,10 @@ def parse_args():
     p.add_argument("--chunk-duration", type=int, default=60, help="Chunk duration in seconds")
     p.add_argument("--fps", type=int, default=1, help="Frames per second for video processing")
 
-    p.add_argument("--video-only", action="store_true", help="Process video only without screenshots or annotations")
-    p.add_argument("--video-extensions", nargs="+", default=[".mp4", ".avi", ".mov", ".mkv"], help="Video file extensions to consider")
-    p.add_argument("--prompt-file", default=None, help="Path to prompt file (default: prompts/default.txt or prompts/video_only.txt if video only)")
+    p.add_argument("--screenshots-only", action="store_true", help="Process screenshots folder only without aggregations or annotations")
+    p.add_argument("--image-extensions", nargs="+", default=[".jpg", ".jpeg", ".png"], help="Image file extensions to consider")
+    p.add_argument("--max-time-gap", type=float, default=300.0, help="Maximum time gap (seconds) between images before forcing a video split (default: 300 = 5 minutes)")
+    p.add_argument("--prompt-file", default=None, help="Path to prompt file (default: prompts/default.txt or prompts/screenshots_only.txt if screenshots only)")
     p.add_argument("--annotate", action="store_true", help="Annotate videos with cursor positions and clicks (only for standard processing)")
     p.add_argument("--skip-existing", action="store_true", help="Skip sessions that have already been processed")
     p.add_argument("--visualize", action="store_true", help="Create annotated video visualizations after processing")
@@ -39,7 +40,7 @@ def parse_args():
     if not args.model:
         args.model = 'gemini-2.5-flash' if args.client == 'gemini' else 'Qwen/Qwen3-VL-8B-Thinking-FP8'
     if not args.prompt_file:
-        args.prompt_file = "prompts/video_only.txt" if args.video_only else "prompts/default.txt"
+        args.prompt_file = "prompts/screenshots_only.txt" if args.screenshots_only else "prompts/default.txt"
 
     return args
 
@@ -49,15 +50,15 @@ def setup_configs(args):
         configs = [create_single_config(
             args.session,
             args.chunk_duration,
-            args.video_only,
-            tuple(args.video_extensions),
+            args.screenshots_only,
+            tuple(args.image_extensions),
         )]
     else:
-        if args.video_only:
-            configs = discover_video_sessions(
+        if args.screenshots_only:
+            configs = discover_screenshots_sessions(
                 args.sessions_root,
                 args.chunk_duration,
-                tuple(args.video_extensions),
+                tuple(args.image_extensions),
             )
         else:
             configs = discover_sessions(
@@ -82,14 +83,15 @@ def process_with_gemini(args, configs):
     processor = Processor(
         client=client,
         num_workers=args.num_workers,
-        video_only=args.video_only,
+        screenshots_only=args.screenshots_only,
         prompt_file=args.prompt_file,
+        max_time_gap=args.max_time_gap,
     )
 
     return processor.process_sessions(
         configs,
         fps=args.fps,
-        annotate=args.annotate and not args.video_only,
+        annotate=args.annotate and not args.screenshots_only,
     )
 
 
@@ -103,14 +105,15 @@ def process_with_vllm(args, configs):
     processor = Processor(
         client=client,
         num_workers=args.num_workers,
-        video_only=args.video_only,
+        screenshots_only=args.screenshots_only,
         prompt_file=args.prompt_file,
+        max_time_gap=args.max_time_gap,
     )
 
     return processor.process_sessions(
         configs,
         fps=args.fps,
-        annotate=args.annotate and not args.video_only,
+        annotate=args.annotate and not args.screenshots_only,
     )
 
 

diff --git a/src/label/clients/gemini.py b/src/label/clients/gemini.py
@@ -3,7 +3,6 @@
 import os
 import time
 from label.clients.client import VLMClient, CAPTION_SCHEMA
-
 from google import genai
 from google.genai import types
 
@@ -13,6 +12,15 @@ def __init__(self, response):
         self.response = response
         self._json = None
 
+        # Token usage as reported by the API. The usage object and its counts
+        # may be missing or None, so fall back to 0 to keep totals summable.
+        self.input_tokens = 0
+        self.output_tokens = 0
+        usage = getattr(response, 'usage_metadata', None)
+        if usage is not None:
+            self.input_tokens = getattr(usage, 'prompt_token_count', 0) or 0
+            self.output_tokens = getattr(usage, 'candidates_token_count', 0) or 0
+
     @property
     def text(self) -> str:
         return self.response.text
@@ -36,6 +44,10 @@ def __init__(self, api_key: Optional[str] = None, model_name: str = "gemini-2.5-
         self.client = genai.Client(api_key=api_key)
         self.model_name = model_name
 
+        # Token tracking
+        self.total_input_tokens = 0
+        self.total_output_tokens = 0
+
     def upload_file(self, path: str) -> Any:
         video_file = self.client.files.upload(file=path)
 
@@ -56,21 +68,56 @@ def upload_file(self, path: str) -> Any:
 
     def generate(self, prompt: str, file_descriptor: Optional[Any] = None,
                  schema: Optional[Dict] = None) -> GeminiResponse:
-
         inputs = [prompt]
         if file_descriptor:
             inputs.append(file_descriptor)
 
-        config = types.GenerateContentConfig(
-            response_mime_type="application/json",
-            temperature=0.0,
-            response_schema=schema or CAPTION_SCHEMA
-        )
+        # Build the request options once: every model gets JSON output at
+        # temperature 0 constrained by the schema; model names containing
+        # "gemini-3" additionally get high thinking level and media resolution.
+        config_kwargs = {
+            "response_mime_type": "application/json",
+            "temperature": 0.0,
+            "response_schema": schema or CAPTION_SCHEMA,
+        }
+        if "gemini-3" in self.model_name:
+            config_kwargs.update(
+                thinking_config=types.ThinkingConfig(thinking_level="high"),
+                media_resolution=types.MediaResolution.MEDIA_RESOLUTION_HIGH,
+            )
+        config = types.GenerateContentConfig(**config_kwargs)
 
         res = self.client.models.generate_content(
             model=self.model_name,
             contents=inputs,
             config=config
         )
 
-        return GeminiResponse(res)
+        response = GeminiResponse(res)
+
+        # Track tokens
+        self.total_input_tokens += response.input_tokens
+        self.total_output_tokens += response.output_tokens
+
+        return response
+
+    def get_token_stats(self) -> Dict[str, int]:
+        """Return the accumulated input, output and combined token counts."""
+        sent, received = self.total_input_tokens, self.total_output_tokens
+        stats = {"input_tokens": sent, "output_tokens": received}
+        # The combined figure is derived on demand rather than stored.
+        stats["total_tokens"] = sent + received
+        return stats
+
+    def reset_token_stats(self):
+        """Zero the running input and output token counters."""
+        for counter in ("total_input_tokens", "total_output_tokens"):
+            setattr(self, counter, 0)
+
+    def print_token_stats(self, prefix: str = ""):
+        """Print input, output and total token counts, headed by `prefix`."""
+        stats = self.get_token_stats()
+        print(f"\n{prefix}Token Usage:")
+        print(f"  Input tokens:  {stats['input_tokens']:,}")
+        print(f"  Output tokens: {stats['output_tokens']:,}")
+        print(f"  Total tokens:  {stats['total_tokens']:,}")
diff --git a/src/label/discovery.py b/src/label/discovery.py
@@ -43,10 +43,10 @@ def discover_sessions(
     return configs
 
 
-def discover_video_sessions(
+def discover_screenshots_sessions(
     sessions_root: Path,
     chunk_duration: int = 60,
-    video_exts: Tuple[str, ...] = (".mp4", ".avi", ".mov", ".mkv")
+    image_exts: Tuple[str, ...] = (".jpg", ".jpeg", ".png")
 ) -> List[SessionConfig]:
 
     if not sessions_root.exists():
@@ -58,23 +58,22 @@ def discover_video_sessions(
         if not session_dir.is_dir():
             continue
 
-        video_files = [
-            f for f in session_dir.iterdir()
-            if f.is_file() and f.suffix.lower() in video_exts
-        ]
+        # Require a screenshots/ directory (a same-named file would break iterdir)
+        screenshots_dir = session_dir / "screenshots"
+        if not screenshots_dir.is_dir():
+            continue
 
-        video_subdir = session_dir / "video"
-        if video_subdir.exists():
-            video_files.extend([
-                f for f in video_subdir.iterdir()
-                if f.is_file() and f.suffix.lower() in video_exts
-            ])
+        # Check if there are any image files
+        image_files = [
+            f for f in screenshots_dir.iterdir()
+            if f.is_file() and f.suffix.lower() in image_exts
+        ]
 
-        if video_files:
+        if image_files:
             configs.append(SessionConfig(
                 session_folder=session_dir,
                 chunk_duration=chunk_duration,
-                video_path=VideoPath(video_files[0])
+                _screenshots_dir=screenshots_dir
             ))
 
     return configs
@@ -83,31 +82,31 @@ def discover_video_sessions(
 def create_single_config(
     session_dir: Path,
     chunk_duration: int,
-    video_only: bool,
-    video_exts: Tuple[str, ...],
+    screenshots_only: bool,
+    image_exts: Tuple[str, ...],
     prompt: str = ""
 ) -> SessionConfig:
 
-    if video_only:
-        video_files = [
-            f for f in session_dir.iterdir()
-            if f.is_file() and f.suffix.lower() in video_exts
+    if screenshots_only:
+        # Check if there's a screenshots subdirectory first
+        screenshots_dir = session_dir / "screenshots"
+        if screenshots_dir.exists() and screenshots_dir.is_dir():
+            search_dir = screenshots_dir
+        else:
+            search_dir = session_dir
+
+        image_files = [
+            f for f in search_dir.iterdir()
+            if f.is_file() and f.suffix.lower() in image_exts
         ]
 
-        video_subdir = session_dir / "video"
-        if video_subdir.exists():
-            video_files.extend([
-                f for f in video_subdir.iterdir()
-                if f.is_file() and f.suffix.lower() in video_exts
-            ])
-
-        if not video_files:
-            raise RuntimeError(f"No video files found in {session_dir}")
+        if not image_files:
+            raise RuntimeError(f"No image files found in {search_dir}")
 
         return SessionConfig(
             session_folder=session_dir,
             chunk_duration=chunk_duration,
-            video_path=VideoPath(video_files[0])
+            _screenshots_dir=search_dir
         )
     else:
         return SessionConfig(

diff --git a/src/label/models.py b/src/label/models.py
@@ -449,6 +449,7 @@ class SessionConfig:
     chunk_duration: int = 60
     video_path: Optional[VideoPath] = None
     agg_path: Optional[Path] = None
+    _screenshots_dir: Optional[Path] = None
 
     @property
     def session_id(self) -> str:
@@ -468,6 +469,8 @@ def aggregations_dir(self) -> Path:
 
     @property
     def screenshots_dir(self) -> Path:
+        if self._screenshots_dir is not None:
+            return self._screenshots_dir
         return self.session_folder / "screenshots"
 
     @property