diff --git a/LESSONS_LEARNED.md b/LESSONS_LEARNED.md new file mode 100644 index 00000000..1e45b39c --- /dev/null +++ b/LESSONS_LEARNED.md @@ -0,0 +1,269 @@ +# Lessons Learned - Stdin Support Implementation + +## Session Date: 2026-02-07 +## Feature: Stdin Support for Summarize CLI + +--- + +## Testing Patterns + +### Injecting Stdin for Tests +When testing CLI functionality that reads from stdin, always inject the stdin stream through the RunEnv rather than mocking process.stdin globally: + +```typescript +// Good: Injected stdin +type RunEnv = { + env: Record + fetch: typeof fetch + execFile?: ExecFileFn + stdin?: NodeJS.ReadableStream // Add this + stdout: NodeJS.WritableStream + stderr: NodeJS.WritableStream +} + +// Usage in test: +await runCli(['-'], { + env: { HOME: home }, + fetch: vi.fn(), + stdin: createStdinStream('test content'), // Injected + stdout: noopStream(), + stderr: noopStream(), +}) +``` + +### Creating Test Streams +```typescript +const createStdinStream = (content: string): Readable => { + return Readable.from([content]) +} + +const noopStream = () => + new Writable({ + write(chunk, encoding, callback) { + void chunk + void encoding + callback() + }, + }) +``` + +--- + +## Code Review Integration + +### CodeRabbit Workflow +1. **Address actionable comments first** - Critical bugs, security issues, broken functionality +2. **Consider nitpicks carefully** - Some are worth fixing (code clarity), others are stylistic +3. **Verify fixes** - Always rebuild and retest after addressing comments +4. **Commit pattern** - Make separate commits for CodeRabbit fixes to show iteration + +### Common CodeRabbit Patterns +- Import consistency (use `fs.readFile` vs destructured `readFile`) +- Edge case handling (check for file named `-` before treating as stdin) +- Resource cleanup (always use finally blocks for temp files) +- Error message clarity (be specific about what's allowed/not allowed) + +--- + +## Architecture Insights + +### URL Flow vs Asset Flow +The codebase has two distinct processing paths: +- **URL Flow** (`src/run/flows/url/`) - Handles websites, YouTube, podcasts +- **Asset Flow** (`src/run/flows/asset/`) - Handles local files and stdin + +**Key insight:** Markdown converters are created in the URL flow but can be reused in asset flow with proper refactoring. The transcript-to-markdown converter doesn't require URL-specific context. + +### Temp File Strategy +For stdin support, the temp-file approach is clean because: +- Reuses existing `handleFileInput` logic +- Minimal code duplication +- Maintains consistency with file processing +- Easy cleanup in finally blocks + +```typescript +const tempPath = path.join( + os.tmpdir(), + `summarize-stdin-${Date.now()}-${Math.random().toString(36).slice(2, 8)}.txt` +) +try { + await fs.writeFile(tempPath, content, { mode: 0o600 }) + // Process as file... +} finally { + await fs.rm(tempPath, { force: true }).catch(() => {}) +} +``` + +--- + +## Git Workflow for Upstream Contributions + +### Creating PRs to Upstream +**Best practice:** Create upstream PR directly from feature branch without merging to fork main first. + +```bash +# Push feature branch +git push origin feature/stdin-temp-file-support + +# Create PR to upstream from feature branch +gh pr create --repo steipete/summarize \ + --title "feat: add stdin support" \ + --base main \ + --head mvance:feature/stdin-temp-file-support +``` + +**Why this works:** +- Keeps PR open for upstream review +- Clean history +- Can push updates to same branch +- After upstream merge, sync your fork's main + +### Commit Message Conventions +We used conventional commits throughout: +- `feat:` - New features +- `fix:` - Bug fixes +- `docs:` - Documentation updates +- `refactor:` - Code restructuring + +--- + +## Error Handling Best Practices + +### Guard Clause Ordering +**Lesson:** Order matters when stacking guard clauses. We had a bug where a later guard contradicted earlier logic: + +```typescript +// BAD - Third guard rejects file inputs +if (markdownModeExplicitlySet && inputTarget.kind !== 'url') { + throw new Error('Only URLs') +} +if (markdownModeExplicitlySet && inputTarget.kind === 'file' && markdownMode !== 'llm') { + throw new Error('Only llm mode for files') +} +if (markdownModeExplicitlySet && inputTarget.kind !== 'url' && inputTarget.kind !== 'stdin') { + throw new Error('Only URLs') // BUG: rejects files! +} + +// GOOD - Removed redundant third guard +if (markdownModeExplicitlySet && + inputTarget.kind !== 'url' && + inputTarget.kind !== 'file' && + inputTarget.kind !== 'stdin') { + throw new Error('Only URL, file, or stdin') +} +if (markdownModeExplicitlySet && + (inputTarget.kind === 'file' || inputTarget.kind === 'stdin') && + markdownMode !== 'llm') { + throw new Error('Only llm mode') +} +// No third guard needed - covered by first two +``` + +### Error Message Clarity +Bad: `'--markdown-mode is only supported for URL inputs (--markdown-mode llm coming soon)'` + +Good: `'Only --markdown-mode llm is supported for file/stdin inputs; other modes require a URL'` + +**Why:** The first message suggests llm mode isn't supported yet, when it actually is. Be precise about what's allowed vs what's not. + +--- + +## Documentation Tips + +### README Updates +When adding new features: +1. Add clear examples showing the intended use case +2. Remove examples that are antipatterns (e.g., `cat file | summarize -` when direct file path is better) +3. Keep notes consistent with examples + +### Help Text Consistency +- Update both rich help and concise help +- Use consistent formatting +- Avoid awkward syntax like `` - prefer `` with description + +--- + +## Security Considerations + +### Temp File Permissions +Always set restrictive permissions on temp files: +```typescript +await fs.writeFile(tempPath, content, { mode: 0o600 }) +``` + +### Input Size Limits +Prevent OOM with streaming size checks: +```typescript +async function streamToString(stream: NodeJS.ReadableStream, maxBytes: number): Promise { + const chunks: Buffer[] = [] + let totalSize = 0 + for await (const chunk of stream) { + const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk) + totalSize += buffer.length + if (totalSize > maxBytes) { + throw new Error(`Content exceeds ${(maxBytes / 1024 / 1024).toFixed(1)}MB`) + } + chunks.push(buffer) + } + return Buffer.concat(chunks).toString('utf8') +} +``` + +--- + +## Testing Checklist + +Before marking a feature complete: +- [ ] Build passes (`pnpm build`) +- [ ] Linting passes (`pnpm lint`) +- [ ] Unit tests pass (`pnpm test`) +- [ ] Manual testing of core functionality +- [ ] Edge cases tested (empty input, oversized input, etc.) +- [ ] Error messages verified +- [ ] Help text reviewed +- [ ] Documentation updated + +--- + +## Common Pitfalls + +1. **Assuming file checks happen in order** - Always check if path exists before treating `-` as stdin +2. **Forgetting to update all related checks** - When changing input types, update all guards that check `inputTarget.kind` +3. **Redundant validation** - Don't duplicate guard logic; each condition should have a single purpose +4. **Unclear error messages** - Users should immediately understand what's wrong and how to fix it + +--- + +## Useful Commands + +```bash +# Run specific test files +pnpm test tests/cli.stdin.test.ts --run + +# Run multiple test files +pnpm test tests/cli.stdin.test.ts tests/input.resolve-input-target.test.ts --run + +# Build and check +pnpm build && pnpm lint + +# Check git status +git status && git log --oneline -3 + +# View PR comments +gh pr view 3 --repo mvance/summarize --comments +``` + +--- + +## References + +- **Upstream PR:** https://github.com/steipete/summarize/pull/68 +- **Fork PR:** https://github.com/mvance/summarize/pull/3 +- **Feature Branch:** `feature/stdin-temp-file-support` +- **Main Files Changed:** + - `src/content/asset.ts` - InputTarget type and resolution + - `src/run/runner.ts` - Stdin handling logic + - `src/run/help.ts` - Help text updates + - `tests/cli.stdin.test.ts` - New test suite + - `tests/input.resolve-input-target.test.ts` - Stdin resolution tests + - `README.md` - Documentation diff --git a/README.md b/README.md index bd9cd219..af411915 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,18 @@ summarize "/path/to/audio.mp3" summarize "/path/to/video.mp4" ``` +Stdin (pipe content using `-`): + +```bash +echo "content" | summarize - +pbpaste | summarize - +``` + +**Notes:** +- Stdin has a 50MB size limit +- The `-` argument tells summarize to read from standard input +- Useful for piping clipboard content or other command output + YouTube (supports `youtube.com` and `youtu.be`): ```bash diff --git a/src/content/asset.ts b/src/content/asset.ts index dfda4673..6a652f7c 100644 --- a/src/content/asset.ts +++ b/src/content/asset.ts @@ -7,7 +7,10 @@ import mime from 'mime' import { userTextAndImageMessage } from '../llm/prompt.js' -export type InputTarget = { kind: 'url'; url: string } | { kind: 'file'; filePath: string } +export type InputTarget = + | { kind: 'url'; url: string } + | { kind: 'file'; filePath: string } + | { kind: 'stdin' } export type UrlKind = { kind: 'website' } | { kind: 'asset' } @@ -119,6 +122,10 @@ export function resolveInputTarget(raw: string): InputTarget { return { kind: 'file', filePath: asPath } } + if (normalized === '-') { + return { kind: 'stdin' } + } + const extractedUrls = extractHttpUrlsFromText(normalized) const extractedLast = extractedUrls.at(-1) ?? null if (extractedLast && extractedLast !== normalized) { diff --git a/src/run/help.ts b/src/run/help.ts index 6c348b99..01cb04b3 100644 --- a/src/run/help.ts +++ b/src/run/help.ts @@ -12,7 +12,7 @@ export function buildProgram() { return new Command() .name('summarize') .description('Summarize web pages and YouTube links (uses direct provider API keys).') - .argument('[input]', 'URL or local file path to summarize') + .argument('[input]', 'URL, local file path, or - for stdin to summarize') .option( '--youtube ', 'YouTube transcript source: auto, web, no-auto (skip auto-generated captions), yt-dlp, apify', @@ -244,6 +244,7 @@ ${heading('Examples')} ${cmd('summarize "https://example.com" --length 20k --max-output-tokens 2k --timeout 2m --model openai/gpt-5-mini')} ${cmd('summarize "https://example.com" --model mymodel')} ${dim('# config preset')} ${cmd('summarize "https://example.com" --json --verbose')} + ${cmd('pbpaste | summarize -')} ${dim('# summarize clipboard content')} ${heading('Env Vars')} XAI_API_KEY optional (required for xai/... models) @@ -287,11 +288,12 @@ export function buildConciseHelp(): string { return [ 'summarize - Summarize web pages, files, and YouTube links.', '', - 'Usage: summarize [flags]', + 'Usage: summarize [flags]', '', 'Examples:', ' summarize "https://example.com"', ' summarize "/path/to/file.pdf" --model google/gemini-3-flash-preview', + ' pbpaste | summarize -', '', 'Run summarize --help for full options.', `Support: ${SUPPORT_URL}`, diff --git a/src/run/runner.ts b/src/run/runner.ts index 9b56658c..14795f55 100644 --- a/src/run/runner.ts +++ b/src/run/runner.ts @@ -1,5 +1,7 @@ import { execFile } from 'node:child_process' -import { readFile } from 'node:fs/promises' +import fs from 'node:fs/promises' +import os from 'node:os' +import path from 'node:path' import { CommanderError } from 'commander' import { type CacheState, @@ -8,6 +10,7 @@ import { resolveCachePath, } from '../cache.js' import { loadSummarizeConfig } from '../config.js' +import type { InputTarget } from '../content/asset.js' import { parseExtractFormat, parseMaxExtractCharactersArg, @@ -47,17 +50,34 @@ import { createSummaryEngine } from './summary-engine.js' import { isRichTty, supportsColor } from './terminal.js' import { handleTranscriberCliRequest } from './transcriber-cli.js' +async function streamToString(stream: NodeJS.ReadableStream, maxBytes: number): Promise { + const chunks: Buffer[] = [] + let totalSize = 0 + for await (const chunk of stream) { + const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk) + totalSize += buffer.length + if (totalSize > maxBytes) { + throw new Error( + `Stdin content exceeds maximum size of ${(maxBytes / 1024 / 1024).toFixed(1)}MB` + ) + } + chunks.push(buffer) + } + return Buffer.concat(chunks).toString('utf8') +} + type RunEnv = { env: Record fetch: typeof fetch execFile?: ExecFileFn + stdin?: NodeJS.ReadableStream stdout: NodeJS.WritableStream stderr: NodeJS.WritableStream } export async function runCli( argv: string[], - { env, fetch, execFile: execFileOverride, stdout, stderr }: RunEnv + { env, fetch, execFile: execFileOverride, stdin, stdout, stderr }: RunEnv ): Promise { ;(globalThis as unknown as { AI_SDK_LOG_WARNINGS?: boolean }).AI_SDK_LOG_WARNINGS = false @@ -148,7 +168,7 @@ export async function runCli( if (promptFileArg) { let text: string try { - text = await readFile(promptFileArg, 'utf8') + text = await fs.readFile(promptFileArg, 'utf8') } catch (error) { const message = error instanceof Error ? error.message : String(error) throw new Error(`Failed to read --prompt-file ${promptFileArg}: ${message}`) @@ -496,8 +516,22 @@ export async function runCli( if (markdownModeExplicitlySet && format !== 'markdown') { throw new Error('--markdown-mode is only supported with --format md') } - if (markdownModeExplicitlySet && inputTarget.kind !== 'url') { - throw new Error('--markdown-mode is only supported for URL inputs') + if ( + markdownModeExplicitlySet && + inputTarget.kind !== 'url' && + inputTarget.kind !== 'file' && + inputTarget.kind !== 'stdin' + ) { + throw new Error('--markdown-mode is only supported for URL or file inputs') + } + if ( + markdownModeExplicitlySet && + (inputTarget.kind === 'file' || inputTarget.kind === 'stdin') && + markdownMode !== 'llm' + ) { + throw new Error( + 'Only --markdown-mode llm is supported for file/stdin inputs; other modes require a URL' + ) } const metrics = createRunMetrics({ env, @@ -551,6 +585,9 @@ export async function runCli( }) if (extractMode && inputTarget.kind !== 'url') { + if (inputTarget.kind === 'stdin') { + throw new Error('--extract is not supported for piped stdin input') + } throw new Error('--extract is only supported for website/YouTube URLs') } @@ -701,6 +738,28 @@ export async function runCli( clearProgressIfCurrent, } + if (inputTarget.kind === 'stdin') { + const tempPath = path.join( + os.tmpdir(), + `summarize-stdin-${Date.now()}-${Math.random().toString(36).slice(2, 8)}.txt` + ) + const MAX_STDIN_BYTES = 50 * 1024 * 1024 // 50MB limit + try { + const stdinContent = await streamToString(stdin ?? process.stdin, MAX_STDIN_BYTES) + if (!stdinContent.trim()) { + throw new Error('Stdin is empty') + } + await fs.writeFile(tempPath, stdinContent, { mode: 0o600 }) + const stdinInputTarget: InputTarget = { kind: 'file', filePath: tempPath } + if (await handleFileInput(assetInputContext, stdinInputTarget)) { + return + } + throw new Error('Failed to process stdin input') + } finally { + await fs.rm(tempPath, { force: true }).catch(() => {}) + } + } + if (await handleFileInput(assetInputContext, inputTarget)) { return } diff --git a/tests/cli.run.arg-branches.test.ts b/tests/cli.run.arg-branches.test.ts index 1916f654..25279692 100644 --- a/tests/cli.run.arg-branches.test.ts +++ b/tests/cli.run.arg-branches.test.ts @@ -100,7 +100,7 @@ describe('cli run.ts arg parsing branches', () => { stdout: stdout.stream, stderr: stderr.stream, }) - ).rejects.toThrow(/Usage: summarize /) + ).rejects.toThrow(/Usage: summarize /) }) it('--debug defaults --metrics to detailed', async () => { diff --git a/tests/cli.stdin.test.ts b/tests/cli.stdin.test.ts new file mode 100644 index 00000000..a00f29fc --- /dev/null +++ b/tests/cli.stdin.test.ts @@ -0,0 +1,122 @@ +import { mkdtempSync, rmSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { Readable, Writable } from 'node:stream' +import { afterAll, describe, expect, it, vi } from 'vitest' + +import { runCli } from '../src/run.js' + +const noopStream = () => + new Writable({ + write(chunk, encoding, callback) { + void chunk + void encoding + callback() + }, + }) + +const createStdinStream = (content: string): Readable => { + return Readable.from([content]) +} + +describe('cli stdin support', () => { + const home = mkdtempSync(join(tmpdir(), 'summarize-tests-stdin-')) + + afterAll(() => { + rmSync(home, { recursive: true, force: true }) + }) + + it('errors on empty stdin', async () => { + await expect( + runCli(['-'], { + env: { HOME: home }, + fetch: vi.fn() as unknown as typeof fetch, + stdin: createStdinStream(' '), // Whitespace only + stdout: noopStream(), + stderr: noopStream(), + }) + ).rejects.toThrow('Stdin is empty') + }) + + it('errors on completely empty stdin', async () => { + await expect( + runCli(['-'], { + env: { HOME: home }, + fetch: vi.fn() as unknown as typeof fetch, + stdin: createStdinStream(''), // Completely empty + stdout: noopStream(), + stderr: noopStream(), + }) + ).rejects.toThrow('Stdin is empty') + }) + + it('errors on --extract with stdin', async () => { + const testContent = 'This is a test document for extraction.' + + await expect( + runCli(['--extract', '-'], { + env: { HOME: home }, + fetch: vi.fn() as unknown as typeof fetch, + stdin: createStdinStream(testContent), + stdout: noopStream(), + stderr: noopStream(), + }) + ).rejects.toThrow('--extract is not supported for piped stdin input') + }) + + it('allows --markdown-mode llm for stdin (transcript formatting coming soon)', async () => { + // This test verifies that --markdown-mode llm is allowed for stdin + // (actual transcript formatting will be implemented in a future update) + const testContent = 'Test content for markdown mode.' + + try { + await runCli(['--format', 'md', '--markdown-mode', 'llm', '-'], { + env: { HOME: home }, + fetch: vi.fn() as unknown as typeof fetch, + stdin: createStdinStream(testContent), + stdout: noopStream(), + stderr: noopStream(), + }) + // If it succeeds, that's fine - --markdown-mode llm is allowed + } catch (error) { + // If it throws, make sure it's NOT a restriction error + const message = error instanceof Error ? error.message : String(error) + expect(message).not.toMatch(/--markdown-mode is only supported/) + } + }) + + it('rejects --markdown-mode readability for stdin', async () => { + // Only --markdown-mode llm is allowed for stdin (other modes need URL context) + const testContent = 'Test content.' + + await expect( + runCli(['--format', 'md', '--markdown-mode', 'readability', '-'], { + env: { HOME: home }, + fetch: vi.fn() as unknown as typeof fetch, + stdin: createStdinStream(testContent), + stdout: noopStream(), + stderr: noopStream(), + }) + ).rejects.toThrow('Only --markdown-mode llm is supported for file/stdin inputs') + }) + + it('processes stdin correctly for non-extract mode', async () => { + // This test verifies that stdin is processed and doesn't fail with stdin-related errors + const testContent = 'Test content for basic processing.' + + try { + await runCli(['-'], { + env: { HOME: home }, + fetch: vi.fn() as unknown as typeof fetch, + stdin: createStdinStream(testContent), + stdout: noopStream(), + stderr: noopStream(), + }) + // If it succeeds, that's fine - stdin was processed correctly + } catch (error) { + // If it throws, make sure it's NOT a stdin-related error + const message = error instanceof Error ? error.message : String(error) + expect(message).not.toMatch(/Stdin is empty/) + } + }) +}) diff --git a/tests/input.resolve-input-target.test.ts b/tests/input.resolve-input-target.test.ts index c071ac2a..98ceeac7 100644 --- a/tests/input.resolve-input-target.test.ts +++ b/tests/input.resolve-input-target.test.ts @@ -81,6 +81,19 @@ describe('resolveInputTarget', () => { url: 'https://en.wikipedia.org/wiki/Set_(mathematics)', }) }) + + it('resolves - to stdin input', () => { + expect(resolveInputTarget('-')).toEqual({ + kind: 'stdin', + }) + }) + + it('resolves - with whitespace to stdin input', () => { + expect(resolveInputTarget(' - ')).toEqual({ + kind: 'stdin', + }) + }) + it('throws when neither file nor URL can be resolved', () => { expect(() => resolveInputTarget('not a url')).toThrow(/Invalid URL or file path/i) })