Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ prompting_benchmark/data/*
!prompting_benchmark/data/screenshots/

datasets
sample_images

# C extensions
*.so
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ dependencies = [
"screeninfo>=0.8.1",
"pynput>=1.8.1",
"scikit-image>=0.25.2",
"google-genai>=1.45.0",
"google-genai>=1.52.0",
]

[tool.uv]
Expand Down
31 changes: 17 additions & 14 deletions src/label/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import argparse
from dotenv import load_dotenv

from label.discovery import discover_sessions, discover_video_sessions, create_single_config
from label.discovery import discover_sessions, discover_screenshots_sessions, create_single_config
from label.clients import create_client
from label.processor import Processor
from label.visualizer import Visualizer
Expand All @@ -20,9 +20,10 @@ def parse_args():
p.add_argument("--chunk-duration", type=int, default=60, help="Chunk duration in seconds")
p.add_argument("--fps", type=int, default=1, help="Frames per second for video processing")

p.add_argument("--video-only", action="store_true", help="Process video only without screenshots or annotations")
p.add_argument("--video-extensions", nargs="+", default=[".mp4", ".avi", ".mov", ".mkv"], help="Video file extensions to consider")
p.add_argument("--prompt-file", default=None, help="Path to prompt file (default: prompts/default.txt or prompts/video_only.txt if video only)")
p.add_argument("--screenshots-only", action="store_true", help="Process screenshots folder only without aggregations or annotations")
p.add_argument("--image-extensions", nargs="+", default=[".jpg", ".jpeg", ".png"], help="Image file extensions to consider")
p.add_argument("--max-time-gap", type=float, default=300.0, help="Maximum time gap (seconds) between images before forcing a video split (default: 300 = 5 minutes)")
p.add_argument("--prompt-file", default=None, help="Path to prompt file (default: prompts/default.txt or prompts/screenshots_only.txt if screenshots only)")
p.add_argument("--annotate", action="store_true", help="Annotate videos with cursor positions and clicks (only for standard processing)")
p.add_argument("--skip-existing", action="store_true", help="Skip sessions that have already been processed")
p.add_argument("--visualize", action="store_true", help="Create annotated video visualizations after processing")
Expand All @@ -39,7 +40,7 @@ def parse_args():
if not args.model:
args.model = 'gemini-2.5-flash' if args.client == 'gemini' else 'Qwen/Qwen3-VL-8B-Thinking-FP8'
if not args.prompt_file:
args.prompt_file = "prompts/video_only.txt" if args.video_only else "prompts/default.txt"
args.prompt_file = "prompts/screenshots_only.txt" if args.screenshots_only else "prompts/default.txt"

return args

Expand All @@ -49,15 +50,15 @@ def setup_configs(args):
configs = [create_single_config(
args.session,
args.chunk_duration,
args.video_only,
tuple(args.video_extensions),
args.screenshots_only,
tuple(args.image_extensions),
)]
else:
if args.video_only:
configs = discover_video_sessions(
if args.screenshots_only:
configs = discover_screenshots_sessions(
args.sessions_root,
args.chunk_duration,
tuple(args.video_extensions),
tuple(args.image_extensions),
)
else:
configs = discover_sessions(
Expand All @@ -82,14 +83,15 @@ def process_with_gemini(args, configs):
processor = Processor(
client=client,
num_workers=args.num_workers,
video_only=args.video_only,
screenshots_only=args.screenshots_only,
prompt_file=args.prompt_file,
max_time_gap=args.max_time_gap,
)

return processor.process_sessions(
configs,
fps=args.fps,
annotate=args.annotate and not args.video_only,
annotate=args.annotate and not args.screenshots_only,
)


Expand All @@ -103,14 +105,15 @@ def process_with_vllm(args, configs):
processor = Processor(
client=client,
num_workers=args.num_workers,
video_only=args.video_only,
screenshots_only=args.screenshots_only,
prompt_file=args.prompt_file,
max_time_gap=args.max_time_gap,
)

return processor.process_sessions(
configs,
fps=args.fps,
annotate=args.annotate and not args.video_only,
annotate=args.annotate and not args.screenshots_only,
)


Expand Down
63 changes: 55 additions & 8 deletions src/label/clients/gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import os
import time
from label.clients.client import VLMClient, CAPTION_SCHEMA

from google import genai
from google.genai import types

Expand All @@ -13,6 +12,15 @@ def __init__(self, response):
self.response = response
self._json = None

# Extract token usage from the response.
# `usage_metadata` can be missing or None, and its counters may be None
# even when the attribute exists (getattr's default only covers a missing
# attribute). Coerce every such case to 0 so the counts are always ints
# and can be summed by callers without a TypeError.
usage = getattr(response, 'usage_metadata', None)
self.input_tokens = getattr(usage, 'prompt_token_count', None) or 0
self.output_tokens = getattr(usage, 'candidates_token_count', None) or 0

@property
def text(self) -> str:
return self.response.text
Expand All @@ -36,6 +44,10 @@ def __init__(self, api_key: Optional[str] = None, model_name: str = "gemini-2.5-
self.client = genai.Client(api_key=api_key)
self.model_name = model_name

# Token tracking
self.total_input_tokens = 0
self.total_output_tokens = 0

def upload_file(self, path: str) -> Any:
video_file = self.client.files.upload(file=path)

Expand All @@ -56,21 +68,56 @@ def upload_file(self, path: str) -> Any:

def generate(self, prompt: str, file_descriptor: Optional[Any] = None,
schema: Optional[Dict] = None) -> GeminiResponse:

inputs = [prompt]
if file_descriptor:
inputs.append(file_descriptor)

config = types.GenerateContentConfig(
response_mime_type="application/json",
temperature=0.0,
response_schema=schema or CAPTION_SCHEMA
)
if "gemini-3" in self.model_name:
config = types.GenerateContentConfig(
response_mime_type="application/json",
temperature=0.0,
response_schema=schema or CAPTION_SCHEMA,
thinking_config=types.ThinkingConfig(thinking_level="high"),
media_resolution=types.MediaResolution.MEDIA_RESOLUTION_HIGH
)
else:
config = types.GenerateContentConfig(
response_mime_type="application/json",
temperature=0.0,
response_schema=schema or CAPTION_SCHEMA,
)

res = self.client.models.generate_content(
model=self.model_name,
contents=inputs,
config=config
)

return GeminiResponse(res)
response = GeminiResponse(res)

# Track tokens
self.total_input_tokens += response.input_tokens
self.total_output_tokens += response.output_tokens

return response

def get_token_stats(self) -> Dict[str, int]:
    """Return the accumulated input, output and combined token counts."""
    consumed = self.total_input_tokens
    produced = self.total_output_tokens
    return {
        "input_tokens": consumed,
        "output_tokens": produced,
        "total_tokens": consumed + produced,
    }

def reset_token_stats(self):
    """Zero both running token counters."""
    self.total_input_tokens = self.total_output_tokens = 0

def print_token_stats(self, prefix: str = ""):
    """Print the current token totals under a heading that starts with *prefix*."""
    totals = self.get_token_stats()
    print(f"\n{prefix}Token Usage:")
    # One line per counter, in input / output / total order.
    for label, key in (
        ("Input", "input_tokens"),
        ("Output", "output_tokens"),
        ("Total", "total_tokens"),
    ):
        print(f" {label} tokens: {totals[key]:,}")
59 changes: 29 additions & 30 deletions src/label/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@ def discover_sessions(
return configs


def discover_video_sessions(
def discover_screenshots_sessions(
sessions_root: Path,
chunk_duration: int = 60,
video_exts: Tuple[str, ...] = (".mp4", ".avi", ".mov", ".mkv")
image_exts: Tuple[str, ...] = (".jpg", ".jpeg", ".png")
) -> List[SessionConfig]:

if not sessions_root.exists():
Expand All @@ -58,23 +58,22 @@ def discover_video_sessions(
if not session_dir.is_dir():
continue

video_files = [
f for f in session_dir.iterdir()
if f.is_file() and f.suffix.lower() in video_exts
]
# Look for screenshots directory
screenshots_dir = session_dir / "screenshots"
if not screenshots_dir.exists():
continue

video_subdir = session_dir / "video"
if video_subdir.exists():
video_files.extend([
f for f in video_subdir.iterdir()
if f.is_file() and f.suffix.lower() in video_exts
])
# Check if there are any image files
image_files = [
f for f in screenshots_dir.iterdir()
if f.is_file() and f.suffix.lower() in image_exts
]

if video_files:
if image_files:
configs.append(SessionConfig(
session_folder=session_dir,
chunk_duration=chunk_duration,
video_path=VideoPath(video_files[0])
_screenshots_dir=screenshots_dir
))

return configs
Expand All @@ -83,31 +82,31 @@ def discover_video_sessions(
def create_single_config(
session_dir: Path,
chunk_duration: int,
video_only: bool,
video_exts: Tuple[str, ...],
screenshots_only: bool,
image_exts: Tuple[str, ...],
prompt: str = ""
) -> SessionConfig:

if video_only:
video_files = [
f for f in session_dir.iterdir()
if f.is_file() and f.suffix.lower() in video_exts
if screenshots_only:
# Check if there's a screenshots subdirectory first
screenshots_dir = session_dir / "screenshots"
if screenshots_dir.exists() and screenshots_dir.is_dir():
search_dir = screenshots_dir
else:
search_dir = session_dir

image_files = [
f for f in search_dir.iterdir()
if f.is_file() and f.suffix.lower() in image_exts
]

video_subdir = session_dir / "video"
if video_subdir.exists():
video_files.extend([
f for f in video_subdir.iterdir()
if f.is_file() and f.suffix.lower() in video_exts
])

if not video_files:
raise RuntimeError(f"No video files found in {session_dir}")
if not image_files:
raise RuntimeError(f"No image files found in {search_dir}")

return SessionConfig(
session_folder=session_dir,
chunk_duration=chunk_duration,
video_path=VideoPath(video_files[0])
_screenshots_dir=search_dir
)
else:
return SessionConfig(
Expand Down
3 changes: 3 additions & 0 deletions src/label/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,7 @@ class SessionConfig:
chunk_duration: int = 60
video_path: Optional[VideoPath] = None
agg_path: Optional[Path] = None
_screenshots_dir: Optional[Path] = None

@property
def session_id(self) -> str:
Expand All @@ -468,6 +469,8 @@ def aggregations_dir(self) -> Path:

@property
def screenshots_dir(self) -> Path:
if self._screenshots_dir is not None:
return self._screenshots_dir
return self.session_folder / "screenshots"

@property
Expand Down
Loading