diff --git a/.lenv b/.lenv
new file mode 100644
index 0000000..a8d6d9c
--- /dev/null
+++ b/.lenv
@@ -0,0 +1,8 @@
+# web-capture default configuration
+# This file uses Links Notation format (key: value)
+
+# Server port
+PORT: 3000
+
+# Browser engine (puppeteer or playwright)
+BROWSER_ENGINE: puppeteer
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 3215655..916454d 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -515,7 +515,7 @@ RUN yarn install --frozen-lockfile
COPY . .
EXPOSE 3000
-ENTRYPOINT ["node", "src/index.js"]
+ENTRYPOINT ["node", "bin/web-capture.js", "--serve"]
```
### Docker Compose
diff --git a/Dockerfile b/Dockerfile
index 839f259..440a15d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -42,4 +42,4 @@ COPY . .
EXPOSE 3000
-ENTRYPOINT ["node", "src/index.js"]
+ENTRYPOINT ["node", "bin/web-capture.js", "--serve"]
diff --git a/README.md b/README.md
index eeed61e..d63cdc8 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,37 @@
-A microservice to fetch URLs and render them as:
+A CLI and microservice to fetch URLs and render them as:
+
+- **HTML**: Rendered page content
+- **Markdown**: Converted from HTML
+- **PNG screenshot**: Full page capture
+
+## Quick Start
+
+### CLI Usage
+
+```bash
+# Install globally
+npm install -g web-capture
+
+# Capture a URL as HTML (output to stdout)
+web-capture https://example.com
+
+# Capture as Markdown and save to file
+web-capture https://example.com --format markdown --output page.md
+
+# Take a screenshot
+web-capture https://example.com --format png --output screenshot.png
+
+# Start as API server
+web-capture --serve
+
+# Start server on custom port
+web-capture --serve --port 8080
+```
+
+### API Endpoints (Server Mode)
- **HTML**: GET /html?url=
- **Markdown**: GET /markdown?url=
@@ -16,6 +46,51 @@ npm install
yarn install
```
+## CLI Reference
+
+### Server Mode
+
+Start the API server:
+
+```bash
+web-capture --serve [--port <port>]
+```
+
+| Option | Short | Description | Default |
+|--------|-------|-------------|---------|
+| `--serve` | `-s` | Start as HTTP API server | - |
+| `--port` | `-p` | Port to listen on | 3000 (or PORT env) |
+
+### Capture Mode
+
+Capture a URL directly:
+
+```bash
+web-capture <url> [options]
+```
+
+| Option | Short | Description | Default |
+|--------|-------|-------------|---------|
+| `--format` | `-f` | Output format: `html`, `markdown`/`md`, `image`/`png` | `html` |
+| `--output` | `-o` | Output file path | stdout (text) or auto-generated (images) |
+| `--engine` | `-e` | Browser engine: `puppeteer`, `playwright` | `puppeteer` (or BROWSER_ENGINE env) |
+
+### Examples
+
+```bash
+# Capture HTML to stdout
+web-capture https://example.com
+
+# Capture Markdown to file
+web-capture https://example.com -f markdown -o page.md
+
+# Take screenshot with Playwright engine
+web-capture https://example.com -f png -e playwright -o screenshot.png
+
+# Pipe HTML to another command
+web-capture https://example.com | grep "title"
+```
+
## Available Commands
### Development
@@ -101,6 +176,50 @@ curl http://localhost:3000/image?url=https://example.com > screenshot.png
curl http://localhost:3000/image?url=https://example.com&engine=playwright > screenshot.png
```
+## Configuration
+
+web-capture uses [lino-arguments](https://github.com/link-foundation/lino-arguments) for unified configuration management. Configuration values are resolved with the following priority (highest to lowest):
+
+1. **CLI arguments**: `--port 8080`
+2. **Environment variables**: `PORT=8080`
+3. **Custom configuration file**: `--configuration path/to/custom.lenv`
+4. **Default .lenv file**: `.lenv` in the project root
+5. **Built-in defaults**
+
+### Configuration File (.lenv)
+
+Create a `.lenv` file in your project root using Links Notation format:
+
+```lenv
+# Server configuration
+PORT: 3000
+
+# Browser engine (puppeteer or playwright)
+BROWSER_ENGINE: puppeteer
+```
+
+### Using Custom Configuration Files
+
+Specify a custom configuration file path:
+
+```bash
+web-capture --serve --configuration /path/to/custom.lenv
+```
+
+### Environment Variables
+
+All configuration options support environment variables:
+
+```bash
+# Set port via environment variable
+export PORT=8080
+web-capture --serve
+
+# Set browser engine
+export BROWSER_ENGINE=playwright
+web-capture https://example.com --format png
+```
+
## Browser Engine Support
The service supports both **Puppeteer** and **Playwright** browser engines:
@@ -108,17 +227,15 @@ The service supports both **Puppeteer** and **Playwright** browser engines:
- **Puppeteer**: Default engine, mature and well-tested
- **Playwright**: Alternative engine with similar capabilities
-You can choose the engine using the `engine` query parameter or by setting the `BROWSER_ENGINE` environment variable.
+You can choose the engine using:
+
+- CLI argument: `--engine playwright`
+- Query parameter (server mode): `engine=playwright`
+- Environment variable: `BROWSER_ENGINE=playwright`
+- Configuration file: `BROWSER_ENGINE: playwright` in `.lenv`
**Supported engine values:**
- `puppeteer` or `pptr` - Use Puppeteer
- `playwright` or `pw` - Use Playwright
-**Environment Variable:**
-```bash
-export BROWSER_ENGINE=playwright
-```
-
## Development
The service is built with:
diff --git a/bin/web-capture.js b/bin/web-capture.js
new file mode 100755
index 0000000..12a4330
--- /dev/null
+++ b/bin/web-capture.js
@@ -0,0 +1,256 @@
+#!/usr/bin/env node
+// CLI entry point for web-capture
+// Supports two modes:
+// 1. Server mode: web-capture --serve [--port 3000]
+// 2. Capture mode: web-capture <url> [options]
+
+import { fileURLToPath } from 'url';
+import { dirname, resolve } from 'path';
+import fs from 'fs';
+import { URL } from 'url';
+import { makeConfig } from 'lino-arguments';
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+// Create configuration using lino-arguments pattern
+const config = makeConfig({
+ yargs: ({ yargs, getenv }) => {
+ return yargs
+ .usage('web-capture - Capture web pages as HTML, Markdown, or PNG\n\nUsage:\n web-capture --serve [--port ] Start as API server\n web-capture [options] Capture a URL to file/stdout')
+ .option('serve', {
+ alias: 's',
+ type: 'boolean',
+ description: 'Start as HTTP API server',
+ default: false
+ })
+ .option('port', {
+ alias: 'p',
+ type: 'number',
+ description: 'Port to listen on (default: 3000, or PORT env)',
+ default: getenv('PORT', 3000)
+ })
+ .option('format', {
+ alias: 'f',
+ type: 'string',
+ description: 'Output format: html, markdown, md, image, png',
+ default: 'html'
+ })
+ .option('output', {
+ alias: 'o',
+ type: 'string',
+ description: 'Output file path (default: stdout for text, auto-generated for images)'
+ })
+ .option('engine', {
+ alias: 'e',
+ type: 'string',
+ description: 'Browser engine: puppeteer, playwright',
+ default: getenv('BROWSER_ENGINE', 'puppeteer')
+ })
+ .option('configuration', {
+ type: 'string',
+ description: 'Path to .lenv configuration file'
+ })
+ .help('help')
+ .alias('help', 'h')
+ .version()
+ .alias('version', 'v')
+ .example('web-capture --serve', 'Start API server on port 3000')
+ .example('web-capture --serve --port 8080', 'Start API server on custom port')
+ .example('web-capture https://example.com', 'Capture URL as HTML to stdout')
+ .example('web-capture https://example.com --format markdown --output page.md', 'Capture URL as Markdown to file')
+ .example('web-capture https://example.com --format png --engine playwright -o screenshot.png', 'Capture screenshot using Playwright')
+ .epilogue('API Endpoints (in server mode):\n GET /html?url=&engine= Get rendered HTML\n GET /markdown?url= Get Markdown conversion\n GET /image?url=&engine= Get PNG screenshot\n GET /fetch?url= Proxy fetch\n GET /stream?url= Streaming proxy')
+ .strict();
+ },
+ lenv: {
+ enabled: true,
+ path: '.lenv'
+ }
+});
+
+async function startServer(port) {
+ // Import the Express app
+ const { app } = await import('../src/index.js');
+
+ return new Promise((resolve, reject) => {
+ const server = app.listen(port, () => {
+ console.log(`web-capture server listening on http://localhost:${port}`);
+ console.log('');
+ console.log('Available endpoints:');
+ console.log(` GET /html?url= - Render page as HTML`);
+ console.log(` GET /markdown?url= - Convert page to Markdown`);
+ console.log(` GET /image?url= - Screenshot page as PNG`);
+ console.log(` GET /fetch?url= - Proxy fetch content`);
+ console.log(` GET /stream?url= - Stream content`);
+ console.log('');
+ console.log('Press Ctrl+C to stop the server');
+ resolve(server);
+ });
+
+ server.on('error', reject);
+
+ // Handle graceful shutdown
+ function shutdown(signal) {
+ console.log(`\nReceived ${signal}, shutting down...`);
+ server.close(() => {
+ console.log('Server closed');
+ process.exit(0);
+ });
+ // Force exit if not closed in 2 seconds
+ setTimeout(() => {
+ console.error('Force exiting after 2s');
+ process.exit(1);
+ }, 2000);
+ }
+
+ process.on('SIGTERM', () => shutdown('SIGTERM'));
+ process.on('SIGINT', () => shutdown('SIGINT'));
+ });
+}
+
+async function captureUrl(url, options) {
+ const { format, output, engine } = options;
+
+ // Ensure URL is absolute
+ let absoluteUrl = url;
+ if (!url.startsWith('http://') && !url.startsWith('https://')) {
+ absoluteUrl = `https://${url}`;
+ }
+
+ // Validate URL
+ try {
+ new URL(absoluteUrl);
+ } catch (err) {
+ console.error(`Error: Invalid URL "${url}"`);
+ process.exit(1);
+ }
+
+ // Import required modules
+ const { fetchHtml, convertHtmlToMarkdown, convertToUtf8, convertRelativeUrls } = await import('../src/lib.js');
+ const { createBrowser } = await import('../src/browser.js');
+
+ const normalizedFormat = format.toLowerCase();
+
+ try {
+ if (normalizedFormat === 'markdown' || normalizedFormat === 'md') {
+ // Markdown format
+ const html = await fetchHtml(absoluteUrl);
+ const markdown = convertHtmlToMarkdown(html, absoluteUrl);
+
+ if (output) {
+ fs.writeFileSync(output, markdown, 'utf-8');
+ console.error(`Markdown saved to: ${output}`);
+ } else {
+ process.stdout.write(markdown);
+ }
+ } else if (normalizedFormat === 'image' || normalizedFormat === 'png' || normalizedFormat === 'screenshot') {
+ // Image/screenshot format
+ const browser = await createBrowser(engine);
+ try {
+ const page = await browser.newPage();
+ await page.setExtraHTTPHeaders({
+ 'Accept-Language': 'en-US,en;q=0.9',
+ 'Accept-Charset': 'utf-8'
+ });
+ await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
+ await page.setViewport({ width: 1280, height: 800 });
+ await page.goto(absoluteUrl, {
+ waitUntil: 'networkidle0',
+ timeout: 30000
+ });
+ // Wait for 5 seconds after page load
+ await new Promise(resolve => setTimeout(resolve, 5000));
+
+ const buffer = await page.screenshot({ type: 'png' });
+
+ if (output) {
+ fs.writeFileSync(output, buffer);
+ console.error(`Screenshot saved to: ${output}`);
+ } else {
+ // Generate default filename based on URL
+ const urlObj = new URL(absoluteUrl);
+ const defaultFilename = `${urlObj.hostname.replace(/\./g, '_')}_${Date.now()}.png`;
+ fs.writeFileSync(defaultFilename, buffer);
+ console.error(`Screenshot saved to: ${defaultFilename}`);
+ }
+ } finally {
+ await browser.close();
+ }
+ } else {
+ // HTML format (default)
+ const html = await fetchHtml(absoluteUrl);
+ const hasJavaScript = /