diff --git a/.npmignore b/.npmignore new file mode 100644 index 00000000..8a21bf23 --- /dev/null +++ b/.npmignore @@ -0,0 +1,12 @@ +apps/ +packages/ +docs/ +node_modules/ +bun.lock +.git/ +.gitignore +CLAUDE.md +tsconfig.json +biome.json +*.ts +*.tsx \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..c3748bbb --- /dev/null +++ b/Dockerfile @@ -0,0 +1,53 @@ +# Multi-stage build for ccflare +FROM oven/bun:1-alpine AS builder + +WORKDIR /app + +# Copy package files for dependency caching +COPY package.json bun.lock* ./ + +# Copy all source code (required for workspace dependencies) +COPY . . + +# Install dependencies +RUN bun install --frozen-lockfile + +# Build the project +RUN bun run build + +# Production stage +FROM oven/bun:1-alpine AS runner + +WORKDIR /app + +# Install SQLite tools for database repair and debugging +RUN apk add --no-cache sqlite + +# Create non-root user +RUN addgroup -g 1001 -S ccflare && \ adduser -S ccflare -u 1001 -G ccflare + +# Copy built application +COPY --from=builder --chown=ccflare:ccflare /app . + +# Copy repair scripts +COPY --chown=ccflare:ccflare scripts/ /app/scripts/ +RUN find /app/scripts -name '*.sh' -type f -exec chmod +x {} + 2>/dev/null || true + +# Create data directory for SQLite database +RUN mkdir -p /app/data && chown ccflare:ccflare /app/data + +# Switch to non-root user +USER ccflare + +# Set API key for authentication (change this in production!) +ENV API_KEY=ccflare-default-key + +# Set database path to persistent volume mount +ENV ccflare_DB_PATH=/app/data/ccflare.db + +# Expose port +EXPOSE 8080 + +# Start the server (not TUI) +CMD ["bun", "run", "server"] \ No newline at end of file diff --git a/apps/server/src/server.ts b/apps/server/src/server.ts index 7c933690..707d94fd 100644 --- a/apps/server/src/server.ts +++ b/apps/server/src/server.ts @@ -196,24 +196,65 @@ export default function startServer(options?: { return apiResponse; } + // Check API key for auth protection + const apiKey = process.env.API_KEY; + // Dashboard routes (only if enabled) if (withDashboard) { - if (url.pathname === "/" || url.pathname === "/dashboard") { + // Dashboard routes with API key protection + if (url.pathname === "/" || url.pathname === "/dashboard" || + (apiKey && url.pathname === `/${apiKey}/`)) { + + // If API key is required, only allow /{key}/ access + if (apiKey && url.pathname !== `/${apiKey}/`) { + return new Response("Not Found", { status: HTTP_STATUS.NOT_FOUND }); + } + return serveDashboardFile("/index.html", "text/html"); } - // Serve dashboard static assets - if ((dashboardManifest as Record<string, unknown>)[url.pathname]) { + // Serve dashboard static assets with auth protection + let assetPathname = url.pathname; + let isAuthenticatedAssetRequest = false; + + // If API key is set, check for auth-prefixed asset paths + if (apiKey && url.pathname.startsWith(`/${apiKey}/`)) { + // Strip the key prefix for asset lookup + assetPathname = url.pathname.substring(`/${apiKey}`.length); + isAuthenticatedAssetRequest = true; + } + + if ((dashboardManifest as Record<string, unknown>)[assetPathname]) { + // If API key is required but request is not authenticated, block access + if (apiKey && !isAuthenticatedAssetRequest) { + return new Response("Not Found", { status: HTTP_STATUS.NOT_FOUND }); + } + return serveDashboardFile( - url.pathname, + assetPathname, undefined, CACHE.CACHE_CONTROL_STATIC, ); } } - // All other paths go to proxy - return handleProxy(req, url, proxyContext); + // Handle API authentication and proxying
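+ // Routing contract: with API_KEY set, only /{API_KEY}/v1/* is proxied (the key prefix is stripped before forwarding); with no API_KEY, /v1/* is proxied directly. Everything else returns 404.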
+ if (apiKey) { + // Auth required - check for /key/v1/ format + const pathParts = url.pathname.split('/').filter(Boolean); + if (pathParts[0] === apiKey && pathParts[1] === 'v1') { + // Valid auth - rewrite path and proxy + url.pathname = '/' + pathParts.slice(1).join('/'); + return handleProxy(req, url, proxyContext); + } + return new Response("Not Found", { status: HTTP_STATUS.NOT_FOUND }); + } else { + // No auth required - allow direct /v1/ access + if (!url.pathname.startsWith("/v1/")) { + return new Response("Not Found", { status: HTTP_STATUS.NOT_FOUND }); + } + return handleProxy(req, url, proxyContext); + } }, }); diff --git a/deploy/k8-yaml/k8s-deployment.yaml b/deploy/k8-yaml/k8s-deployment.yaml new file mode 100644 index 00000000..f7fc9562 --- /dev/null +++ b/deploy/k8-yaml/k8s-deployment.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ccflare-data + namespace: coder +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 10Gi + storageClassName: ceph-filesystem +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ccflare + namespace: coder + labels: + app: ccflare +spec: + replicas: 1 + selector: + matchLabels: + app: ccflare + template: + metadata: + labels: + app: ccflare + spec: + securityContext: + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + containers: + - name: ccflare + image: 192.168.96.61:30009/library/ccflare-fork:latest + ports: + - containerPort: 8080 + volumeMounts: + - name: ccflare-data + mountPath: /app/data + volumes: + - name: ccflare-data + persistentVolumeClaim: + claimName: ccflare-data +--- +apiVersion: v1 +kind: Service +metadata: + name: ccflare-service + namespace: coder +spec: + selector: + app: ccflare + ports: + - port: 8080 + targetPort: 8080 + type: ClusterIP \ No newline at end of file diff --git a/index.js b/index.js new file mode 100644 index 00000000..b8876970 --- /dev/null +++ b/index.js @@ -0,0 +1,3 @@ +// ccflare - Claude load balancer proxy +// Placeholder package - implementation coming soon +module.exports = {}; \ No newline at end of file diff --git a/packages/config/src/index.ts b/packages/config/src/index.ts index 7f288bff..8f679dbd 100644 --- a/packages/config/src/index.ts +++ b/packages/config/src/index.ts @@ -8,6 +8,9 @@ import { NETWORK, type StrategyName, TIME_CONSTANTS, + validateNumber, + validateString, + ValidationError, } from "@ccflare/core"; import { Logger } from "@ccflare/logger"; import { resolveConfigPath } from "./paths"; @@ -19,6 +22,19 @@ export interface RuntimeConfig { retry: { attempts: number; delayMs: number; backoff: number }; sessionDurationMs: number; port: number; + database?: { + walMode?: boolean; + busyTimeoutMs?: number; + cacheSize?: number; + synchronous?: 'OFF' | 'NORMAL' | 'FULL'; + mmapSize?: number; + retry?: { + attempts?: number; + delayMs?: number; + backoff?: number; + maxDelayMs?: number; + }; + }; } export interface ConfigData { @@ -29,10 +45,105 @@ retry_backoff?: number; session_duration_ms?: number; port?: number; + // Database configuration + db_wal_mode?: boolean; + db_busy_timeout_ms?: number; + db_cache_size?: number; + db_synchronous?: 'OFF' | 'NORMAL' | 'FULL'; + db_mmap_size?: number; + db_retry_attempts?: number; + db_retry_delay_ms?: number; + db_retry_backoff?: number; + db_retry_max_delay_ms?: number; default_agent_model?: string; [key: string]: string | number | boolean | undefined; }
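+// Illustrative config-file snippet exercising the new db_* keys (example values only; file shape assumed): +// { "db_wal_mode": true, "db_busy_timeout_ms": 10000, "db_synchronous": "FULL", "db_mmap_size": 0, "db_retry_attempts": 3 }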
+/** + * Validates database configuration parameters + */ +function validateDatabaseConfig(config: Partial<NonNullable<RuntimeConfig["database"]>>): void { + if (!config) return; + + // Validate synchronous mode + if (config.synchronous !== undefined) { + validateString(config.synchronous, 'db_synchronous', { + allowedValues: ['OFF', 'NORMAL', 'FULL'] + }); + } + + // Validate numeric parameters with reasonable bounds + if (config.busyTimeoutMs !== undefined) { + validateNumber(config.busyTimeoutMs, 'db_busy_timeout_ms', { + min: 0, + max: 300000, // 5 minutes max + integer: true + }); + } + + if (config.cacheSize !== undefined) { + validateNumber(config.cacheSize, 'db_cache_size', { + min: -2000000, // most negative allowed: -2,000,000 KB (about 2 GB) + max: 1000000, // 1M pages max positive + integer: true + }); + } + + if (config.mmapSize !== undefined) { + validateNumber(config.mmapSize, 'db_mmap_size', { + min: 0, + max: 1073741824, // 1GB max + integer: true + }); + } + + // Validate retry configuration consistency + if (config.retry) { + const retry = config.retry; + + if (retry.attempts !== undefined) { + validateNumber(retry.attempts, 'db_retry_attempts', { + min: 1, + max: 10, + integer: true + }); + } + + if (retry.delayMs !== undefined) { + validateNumber(retry.delayMs, 'db_retry_delay_ms', { + min: 1, + max: 60000, // 1 minute max + integer: true + }); + } + + if (retry.backoff !== undefined) { + validateNumber(retry.backoff, 'db_retry_backoff', { + min: 1, + max: 10 + }); + } + + if (retry.maxDelayMs !== undefined) { + validateNumber(retry.maxDelayMs, 'db_retry_max_delay_ms', { + min: 1, + max: 300000, // 5 minutes max + integer: true + }); + } + + // Ensure maxDelayMs is greater than or equal to delayMs if both are specified + if (retry.delayMs !== undefined && retry.maxDelayMs !== undefined) { + if (retry.maxDelayMs < retry.delayMs) { + throw new ValidationError( + 'db_retry_max_delay_ms must be greater than or equal to db_retry_delay_ms', + 'db_retry_max_delay_ms' + ); + } + } + } +} + export class Config extends EventEmitter { private configPath: string; private data: ConfigData = {}; @@ -163,6 +274,19 @@ export class Config extends EventEmitter { }, sessionDurationMs: TIME_CONSTANTS.SESSION_DURATION_DEFAULT, port: NETWORK.DEFAULT_PORT, + database: { + walMode: true, + busyTimeoutMs: 5000, + cacheSize: -20000, // 20MB cache + synchronous: 'NORMAL', + mmapSize: 268435456, // 256MB + retry: { + attempts: 3, + delayMs: 100, + backoff: 2, + maxDelayMs: 5000, + }, + }, }; // Override with environment variables if present @@ -205,6 +329,73 @@ export class Config extends EventEmitter { defaults.port = this.data.port; } + // Database configuration overrides + // Ensure database configuration object exists + if (!defaults.database) { + defaults.database = { + walMode: true, + busyTimeoutMs: 5000, + cacheSize: -20000, + synchronous: 'NORMAL', + mmapSize: 268435456, + retry: { + attempts: 3, + delayMs: 100, + backoff: 2, + maxDelayMs: 5000, + }, + }; + } + + // Ensure retry configuration object exists + if (!defaults.database.retry) { + defaults.database.retry = { + attempts: 3, + delayMs: 100, + backoff: 2, + maxDelayMs: 5000, + }; + } + + if (typeof this.data.db_wal_mode === "boolean") { + defaults.database.walMode = this.data.db_wal_mode; + } + if (typeof this.data.db_busy_timeout_ms === "number") { + defaults.database.busyTimeoutMs = this.data.db_busy_timeout_ms; + } + if (typeof this.data.db_cache_size === "number") { + defaults.database.cacheSize = this.data.db_cache_size; + } + if (typeof this.data.db_synchronous === "string") { + defaults.database.synchronous = this.data.db_synchronous as 'OFF' | 'NORMAL' | 'FULL'; + }
+ if (typeof this.data.db_mmap_size === "number") { + defaults.database.mmapSize = this.data.db_mmap_size; + } + if (typeof this.data.db_retry_attempts === "number") { + defaults.database.retry.attempts = this.data.db_retry_attempts; + } + if (typeof this.data.db_retry_delay_ms === "number") { + defaults.database.retry.delayMs = this.data.db_retry_delay_ms; + } + if (typeof this.data.db_retry_backoff === "number") { + defaults.database.retry.backoff = this.data.db_retry_backoff; + } + if (typeof this.data.db_retry_max_delay_ms === "number") { + defaults.database.retry.maxDelayMs = this.data.db_retry_max_delay_ms; + } + + // Validate the final database configuration + try { + validateDatabaseConfig(defaults.database); + } catch (error) { + if (error instanceof ValidationError) { + log.error(`Database configuration validation failed: ${error.message}`); + throw error; + } + throw error; + } + return defaults; } } diff --git a/packages/database/src/database-operations.ts b/packages/database/src/database-operations.ts index 2db7aaa0..1abc53e6 100644 --- a/packages/database/src/database-operations.ts +++ b/packages/database/src/database-operations.ts @@ -2,6 +2,7 @@ import { Database } from "bun:sqlite"; import { mkdirSync } from "node:fs"; import { dirname } from "node:path"; import type { Disposable } from "@ccflare/core"; +import type { RuntimeConfig } from "@ccflare/config"; import type { Account, StrategyStore } from "@ccflare/types"; import { ensureSchema, runMigrations } from "./migrations"; import { resolveDbPath } from "./paths"; @@ -14,9 +15,95 @@ import { } from "./repositories/request.repository"; import { StatsRepository } from "./repositories/stats.repository"; import { StrategyRepository } from "./repositories/strategy.repository"; +import { withDatabaseRetrySync } from "./retry"; + +export interface DatabaseConfig { + /** Enable WAL (Write-Ahead Logging) mode for better concurrency */ + walMode?: boolean; + /** SQLite busy timeout in milliseconds */ + busyTimeoutMs?: number; + /** Cache size in pages (negative value = KB) */ + cacheSize?: number; + /** Synchronous mode: OFF, NORMAL, FULL */ + synchronous?: 'OFF' | 'NORMAL' | 'FULL'; + /** Memory-mapped I/O size in bytes */ + mmapSize?: number; + /** Retry configuration for database operations */ + retry?: DatabaseRetryConfig; +} + +export interface DatabaseRetryConfig { + /** Maximum number of retry attempts for database operations */ + attempts?: number; + /** Initial delay between retries in milliseconds */ + delayMs?: number; + /** Backoff multiplier for exponential backoff */ + backoff?: number; + /** Maximum delay between retries in milliseconds */ + maxDelayMs?: number; +} -export interface RuntimeConfig { - sessionDurationMs?: number; +/** + * Apply SQLite pragmas for safety and performance on distributed filesystems. + */ +function configureSqlite(db: Database, config: DatabaseConfig): void { + try { + // Check database integrity first + const integrityResult = db.query("PRAGMA integrity_check").get() as { integrity_check: string }; + if (integrityResult.integrity_check !== "ok") { + throw new Error(`Database integrity check failed: ${integrityResult.integrity_check}`); + } + + // Enable WAL mode for better concurrency (with error handling) + if (config.walMode !== false) { + try { + const result = db.query("PRAGMA journal_mode = WAL").get() as { journal_mode: string }; + if (result.journal_mode !== "wal") { + console.warn("Failed to enable WAL mode, 
falling back to DELETE mode"); + db.run("PRAGMA journal_mode = DELETE"); + } + } catch (error) { + console.warn("WAL mode failed, using DELETE mode:", error); + db.run("PRAGMA journal_mode = DELETE"); + } + } + + // Set busy timeout for lock handling + if (config.busyTimeoutMs !== undefined) { + db.run(`PRAGMA busy_timeout = ${config.busyTimeoutMs}`); + } + + // Configure cache size + if (config.cacheSize !== undefined) { + db.run(`PRAGMA cache_size = ${config.cacheSize}`); + } + + // Set synchronous mode (more conservative for distributed filesystems) + const syncMode = config.synchronous || 'FULL'; // Default to FULL for safety + db.run(`PRAGMA synchronous = ${syncMode}`); + + // Configure memory-mapped I/O (disable on distributed filesystems if problematic) + if (config.mmapSize !== undefined && config.mmapSize > 0) { + try { + db.run(`PRAGMA mmap_size = ${config.mmapSize}`); + } catch (error) { + console.warn("Memory-mapped I/O failed, disabling:", error); + db.run("PRAGMA mmap_size = 0"); + } + } + + // Additional optimizations for distributed filesystems + db.run("PRAGMA temp_store = MEMORY"); + db.run("PRAGMA foreign_keys = ON"); + + // Add checkpoint interval for WAL mode + db.run("PRAGMA wal_autocheckpoint = 1000"); + + } catch (error) { + console.error("Database configuration failed:", error); + throw new Error(`Failed to configure SQLite database: ${error}`); + } } /** @@ -26,6 +113,8 @@ export interface RuntimeConfig { export class DatabaseOperations implements StrategyStore, Disposable { private db: Database; private runtime?: RuntimeConfig; + private dbConfig: DatabaseConfig; + private retryConfig: DatabaseRetryConfig; // Repositories private accounts: AccountRepository; @@ -35,19 +124,37 @@ export class DatabaseOperations implements StrategyStore, Disposable { private stats: StatsRepository; private agentPreferences: AgentPreferenceRepository; - constructor(dbPath?: string) { + constructor(dbPath?: string, dbConfig?: DatabaseConfig, retryConfig?: DatabaseRetryConfig) { const resolvedPath = dbPath ?? 
resolveDbPath(); + // Default database configuration optimized for distributed filesystems + // More conservative settings to prevent corruption on Rook Ceph + this.dbConfig = { + walMode: true, + busyTimeoutMs: 10000, // Increased timeout for distributed storage + cacheSize: -10000, // Reduced cache size (10MB) for stability + synchronous: 'FULL', // Full synchronous mode for data safety + mmapSize: 0, // Disable memory-mapped I/O on distributed filesystems + ...dbConfig + }; + + // Default retry configuration for database operations + this.retryConfig = { + attempts: 3, + delayMs: 100, + backoff: 2, + maxDelayMs: 5000, + ...retryConfig + }; + // Ensure the directory exists const dir = dirname(resolvedPath); mkdirSync(dir, { recursive: true }); this.db = new Database(resolvedPath, { create: true }); - // Configure SQLite for better concurrency - this.db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging - this.db.exec("PRAGMA busy_timeout = 5000"); // Wait up to 5 seconds before throwing "database is locked" - this.db.exec("PRAGMA synchronous = NORMAL"); // Better performance while maintaining safety + // Apply SQLite configuration for distributed filesystem optimization + configureSqlite(this.db, this.dbConfig); ensureSchema(this.db); runMigrations(this.db); @@ -63,19 +170,38 @@ export class DatabaseOperations implements StrategyStore, Disposable { setRuntimeConfig(runtime: RuntimeConfig): void { this.runtime = runtime; + + // Update retry config from runtime config if available + if (runtime.database?.retry) { + this.retryConfig = { + ...this.retryConfig, + ...runtime.database.retry + }; + } } getDatabase(): Database { return this.db; } - // Account operations delegated to repository + /** + * Get the current retry configuration + */ + getRetryConfig(): DatabaseRetryConfig { + return this.retryConfig; + } + + // Account operations delegated to repository with retry logic getAllAccounts(): Account[] { - return this.accounts.findAll(); + return withDatabaseRetrySync(() => { + return this.accounts.findAll(); + }, this.retryConfig, "getAllAccounts"); } getAccount(accountId: string): Account | null { - return this.accounts.findById(accountId); + return withDatabaseRetrySync(() => { + return this.accounts.findById(accountId); + }, this.retryConfig, "getAccount"); } updateAccountTokens( @@ -84,17 +210,23 @@ export class DatabaseOperations implements StrategyStore, Disposable { expiresAt: number, refreshToken?: string, ): void { - this.accounts.updateTokens(accountId, accessToken, expiresAt, refreshToken); + withDatabaseRetrySync(() => { + this.accounts.updateTokens(accountId, accessToken, expiresAt, refreshToken); + }, this.retryConfig, "updateAccountTokens"); } updateAccountUsage(accountId: string): void { const sessionDuration = this.runtime?.sessionDurationMs || 5 * 60 * 60 * 1000; - this.accounts.incrementUsage(accountId, sessionDuration); + withDatabaseRetrySync(() => { + this.accounts.incrementUsage(accountId, sessionDuration); + }, this.retryConfig, "updateAccountUsage"); } markAccountRateLimited(accountId: string, until: number): void { - this.accounts.setRateLimited(accountId, until); + withDatabaseRetrySync(() => { + this.accounts.setRateLimited(accountId, until); + }, this.retryConfig, "markAccountRateLimited"); } updateAccountRateLimitMeta( diff --git a/packages/database/src/factory.ts b/packages/database/src/factory.ts index 854e020e..e142f8bb 100644 --- a/packages/database/src/factory.ts +++ b/packages/database/src/factory.ts @@ -1,5 +1,6 @@ import { 
registerDisposable, unregisterDisposable } from "@ccflare/core"; -import { DatabaseOperations, type RuntimeConfig } from "./index"; +import type { RuntimeConfig } from "@ccflare/config"; +import { DatabaseOperations, type DatabaseConfig, type DatabaseRetryConfig } from "./database-operations"; let instance: DatabaseOperations | null = null; let dbPath: string | undefined; @@ -15,7 +16,18 @@ export function initialize( export function getInstance(): DatabaseOperations { if (!instance) { - instance = new DatabaseOperations(dbPath); + // Extract database configuration from runtime config + const dbConfig: DatabaseConfig | undefined = runtimeConfig?.database ? { + ...(runtimeConfig.database.walMode !== undefined && { walMode: runtimeConfig.database.walMode }), + ...(runtimeConfig.database.busyTimeoutMs !== undefined && { busyTimeoutMs: runtimeConfig.database.busyTimeoutMs }), + ...(runtimeConfig.database.cacheSize !== undefined && { cacheSize: runtimeConfig.database.cacheSize }), + ...(runtimeConfig.database.synchronous !== undefined && { synchronous: runtimeConfig.database.synchronous }), + ...(runtimeConfig.database.mmapSize !== undefined && { mmapSize: runtimeConfig.database.mmapSize }), + } : undefined; + + const retryConfig: DatabaseRetryConfig | undefined = runtimeConfig?.database?.retry; + + instance = new DatabaseOperations(dbPath, dbConfig, retryConfig); if (runtimeConfig) { instance.setRuntimeConfig(runtimeConfig); } diff --git a/packages/database/src/index.ts b/packages/database/src/index.ts index da488b0d..2ed2e612 100644 --- a/packages/database/src/index.ts +++ b/packages/database/src/index.ts @@ -4,7 +4,8 @@ export { DatabaseOperations }; // Re-export other utilities export { AsyncDbWriter } from "./async-writer"; -export type { RuntimeConfig } from "./database-operations"; +export type { RuntimeConfig } from "@ccflare/config"; +export type { DatabaseConfig, DatabaseRetryConfig } from "./database-operations"; export { DatabaseFactory } from "./factory"; export { ensureSchema, runMigrations } from "./migrations"; export { resolveDbPath } from "./paths"; @@ -12,3 +13,6 @@ export { analyzeIndexUsage } from "./performance-indexes"; // Re-export repository types export type { StatsRepository } from "./repositories/stats.repository"; + +// Re-export retry utilities for external use +export { withDatabaseRetry, withDatabaseRetrySync } from "./retry"; diff --git a/packages/database/src/migrations.ts b/packages/database/src/migrations.ts index 0afb29eb..9182c573 100644 --- a/packages/database/src/migrations.ts +++ b/packages/database/src/migrations.ts @@ -50,11 +50,21 @@ export function ensureSchema(db: Database): void { ) `); - // Create index for faster queries + // Create indexes for faster queries db.run( `CREATE INDEX IF NOT EXISTS idx_requests_timestamp ON requests(timestamp DESC)`, ); + // Index for JOIN performance with accounts table + db.run( + `CREATE INDEX IF NOT EXISTS idx_requests_account_used ON requests(account_used)`, + ); + + // Composite index for the main requests query (timestamp DESC with account_used for JOIN) + db.run( + `CREATE INDEX IF NOT EXISTS idx_requests_timestamp_account ON requests(timestamp DESC, account_used)`, + ); + // Create request_payloads table for storing full request/response data db.run(` CREATE TABLE IF NOT EXISTS request_payloads ( diff --git a/packages/database/src/retry.ts b/packages/database/src/retry.ts new file mode 100644 index 00000000..c31727dc --- /dev/null +++ b/packages/database/src/retry.ts @@ -0,0 +1,177 @@
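+// Retry helpers for transient SQLite lock contention (SQLITE_BUSY/SQLITE_LOCKED), +// used by DatabaseOperations to wrap repository calls with exponential backoff.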
+import { Logger } from "@ccflare/logger"; +import type { DatabaseRetryConfig } from "./database-operations"; + +const logger = new Logger("db-retry"); + +/** + * Error codes that indicate database lock contention and should trigger retries + */ +const RETRYABLE_SQLITE_ERRORS = [ + "SQLITE_BUSY", + "SQLITE_LOCKED", + "database is locked", + "database table is locked", +]; + +/** + * Check if an error is retryable (indicates database lock contention) + */ +function isRetryableError(error: unknown): boolean { + if (!error) return false; + + const errorMessage = error instanceof Error ? error.message : String(error); + const errorCode = (error as { code?: string }).code; + + return RETRYABLE_SQLITE_ERRORS.some(retryableError => + errorMessage.includes(retryableError) || errorCode === retryableError + ); +} + +/** + * Calculate delay for exponential backoff with jitter + */ +function calculateDelay(attempt: number, config: Required<DatabaseRetryConfig>): number { + const baseDelay = config.delayMs * Math.pow(config.backoff, attempt); + const jitter = Math.random() * 0.1 * baseDelay; // Add 10% jitter + const delayWithJitter = baseDelay + jitter; + + return Math.min(delayWithJitter, config.maxDelayMs); +}
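+// Worked example with the defaults (delayMs=100, backoff=2, maxDelayMs=5000): +// attempt 0 waits ~100ms, attempt 1 ~200ms, attempt 2 ~400ms, +// each plus up to 10% jitter and never exceeding maxDelayMs.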
+ +/** + * Sleep for the specified number of milliseconds + */ +function sleep(ms: number): Promise<void> { + return new Promise(resolve => setTimeout(resolve, ms)); +} + +/** + * Synchronous sleep function + */ +function sleepSync(ms: number): void { + // Synchronous sleep using Bun.sleepSync if available, otherwise Node.js fallback + if (typeof Bun !== 'undefined' && Bun.sleepSync) { + Bun.sleepSync(ms); + } else { + // Try Node.js child_process.spawnSync as a blocking fallback + try { + const { spawnSync } = require('child_process'); + const sleepCommand = process.platform === 'win32' ? 'timeout' : 'sleep'; + const sleepArg = process.platform === 'win32' ? `/t ${Math.ceil(ms / 1000)}` : `${ms / 1000}`; + + spawnSync(sleepCommand, [sleepArg], { + stdio: 'ignore', + shell: process.platform === 'win32' + }); + } catch (error) { + // If child_process is not available or fails, throw an error instead of busy waiting + throw new Error( + `Synchronous sleep not supported in this environment. ` + + `Bun.sleepSync is not available and Node.js child_process failed: ${error instanceof Error ? error.message : String(error)}` + ); + } + } +} + +/** + * Core retry loop for synchronous operations + */ +function executeWithRetry<T>( + operation: () => T, + config: Required<DatabaseRetryConfig>, + operationName: string, +): T { + let lastError: unknown; + + for (let attempt = 0; attempt < config.attempts; attempt++) { + try { + const result = operation(); + + // Log successful retry if this wasn't the first attempt + if (attempt > 0) { + logger.info(`${operationName} succeeded after ${attempt + 1} attempts`); + } + + return result; + } catch (error) { + lastError = error; + + // Non-retryable errors propagate immediately + if (!isRetryableError(error)) { + logger.debug(`${operationName} failed with non-retryable error:`, error); + throw error; + } + + // If this was the last attempt, throw the error + if (attempt === config.attempts - 1) { + logger.error(`${operationName} failed after ${config.attempts} attempts:`, error); + throw error; + } + + // Calculate delay and block before the next attempt + const delay = calculateDelay(attempt, config); + logger.warn( + `${operationName} failed (attempt ${attempt + 1}/${config.attempts}), retrying in ${delay.toFixed(0)}ms:`, + error instanceof Error ? error.message : String(error) + ); + sleepSync(delay); + } + } + + // Unreachable, but TypeScript requires it + throw lastError; +} + +/** + * Retry wrapper for async database operations with exponential backoff. + * Awaits the operation inside the loop so that rejected promises are retried too. + */ +export async function withDatabaseRetry<T>( + operation: () => T | Promise<T>, + config: DatabaseRetryConfig = {}, + operationName = "database operation" +): Promise<T> { + const retryConfig: Required<DatabaseRetryConfig> = { + attempts: 3, + delayMs: 100, + backoff: 2, + maxDelayMs: 5000, + ...config, + }; + + let lastError: unknown; + for (let attempt = 0; attempt < retryConfig.attempts; attempt++) { + try { + const result = await operation(); + if (attempt > 0) { + logger.info(`${operationName} succeeded after ${attempt + 1} attempts`); + } + return result; + } catch (error) { + lastError = error; + if (!isRetryableError(error)) { + throw error; + } + if (attempt === retryConfig.attempts - 1) { + logger.error(`${operationName} failed after ${retryConfig.attempts} attempts:`, error); + throw error; + } + const delay = calculateDelay(attempt, retryConfig); + logger.warn(`${operationName} failed (attempt ${attempt + 1}/${retryConfig.attempts}), retrying in ${delay.toFixed(0)}ms`); + await sleep(delay); + } + } + throw lastError; +} + +/** + * Synchronous retry wrapper for database operations + */ +export function withDatabaseRetrySync<T>( + operation: () => T, + config: DatabaseRetryConfig = {}, + operationName = "database operation" +): T { + const retryConfig: Required<DatabaseRetryConfig> = { + attempts: 3, + delayMs: 100, + backoff: 2, + maxDelayMs: 5000, + ...config, + }; + + return executeWithRetry(operation, retryConfig, operationName); +} diff --git a/packages/http-api/src/handlers/requests.ts b/packages/http-api/src/handlers/requests.ts index cc584073..dbd0efde 100644 --- a/packages/http-api/src/handlers/requests.ts +++ b/packages/http-api/src/handlers/requests.ts @@ -1,5 +1,6 @@ import type { Database } from "bun:sqlite"; import type { DatabaseOperations } from "@ccflare/database"; +import { validateString } from "@ccflare/core"; import { jsonResponse } from "@ccflare/http-common"; import type { RequestResponse } from "../types"; @@ -94,3 +95,37 @@ export function createRequestsDetailHandler(dbOps: DatabaseOperations) { return jsonResponse(parsed); }; } + +/** + * Create a handler for individual request payload retrieval + */ +export function createRequestPayloadHandler(dbOps: DatabaseOperations) { + return (requestId: string): Response => { + // Validate requestId parameter + try { + validateString(requestId, 'requestId', { + required: true, + minLength: 1, + maxLength: 255, + pattern: /^[a-zA-Z0-9\-_]+$/ + }); + } catch (error) { + return jsonResponse( + { error: 'Invalid request ID format' }, + 400 + ); + } + + const payload = dbOps.getRequestPayload(requestId); + + if (!payload) { + return jsonResponse( + { error: 'Request not found' }, + 404 + ); + } + + // The payload is already parsed by the repository, return it directly + return jsonResponse(payload); + }; +} diff --git a/packages/http-api/src/router.ts b/packages/http-api/src/router.ts index bbb0d3a0..fbd3aae8 100644 --- a/packages/http-api/src/router.ts +++ b/packages/http-api/src/router.ts @@ -26,6 +26,7 @@ import { import { createRequestsDetailHandler, createRequestsSummaryHandler, + createRequestPayloadHandler, } from "./handlers/requests"; import { createRequestsStreamHandler } from "./handlers/requests-stream"; import { createStatsHandler, createStatsResetHandler } from "./handlers/stats"; @@ -104,6 +105,7 @@ export class APIRouter { this.handlers.set("GET:/api/requests/stream", () => requestsStreamHandler(), ); + // Note: Dynamic route for request payloads is handled in the handleRequest() method this.handlers.set("GET:/api/config", () => configHandlers.getConfig()); this.handlers.set("GET:/api/config/strategy", () => configHandlers.getStrategy(), ); @@ -164,6 +166,14 @@ return 
await this.wrapHandler(handler)(req, url); } + // Check for dynamic request payload endpoints + if (path.startsWith("/api/requests/payload/") && method === "GET") { + const parts = path.split("/"); + const requestId = parts[4]; // /api/requests/payload/{id} + const requestPayloadHandler = createRequestPayloadHandler(this.context.dbOps); + return await this.wrapHandler(() => requestPayloadHandler(requestId))(req, url); + } + // Check for dynamic account endpoints if (path.startsWith("/api/accounts/")) { const parts = path.split("/"); diff --git a/packages/tui-core/src/requests.ts b/packages/tui-core/src/requests.ts index 9af1c13b..bb299295 100644 --- a/packages/tui-core/src/requests.ts +++ b/packages/tui-core/src/requests.ts @@ -1,4 +1,4 @@ -import { DatabaseFactory } from "@ccflare/database"; +import { DatabaseFactory, withDatabaseRetrySync } from "@ccflare/database"; import type { RequestPayload } from "@ccflare/types"; export type { RequestPayload }; @@ -17,17 +17,19 @@ export interface RequestSummary { export async function getRequests(limit = 100): Promise<RequestPayload[]> { const dbOps = DatabaseFactory.getInstance(); - const rows = dbOps.listRequestPayloads(limit); - const parsed = rows.map((r: { id: string; json: string }) => { + // Use the optimized database method that includes account names in a single JOIN + // This eliminates N+1 queries and uses the performance-optimized method + const rows = withDatabaseRetrySync(() => { + return dbOps.listRequestPayloadsWithAccountNames(limit); + }, dbOps.getRetryConfig(), "getRequests"); + + const parsed = rows.map((r: { id: string; json: string; account_name: string | null }) => { try { const data = JSON.parse(r.json); - // Add account name if we have accountId - if (data.meta?.accountId) { - const account = dbOps.getAccount(data.meta.accountId); - if (account) { - data.meta.accountName = account.name; - } + // Add account name from the JOIN result (no additional query needed) + if (r.account_name && data.meta) { + data.meta.accountName = r.account_name; } return { id: r.id, ...data } as RequestPayload; } catch { @@ -44,29 +46,45 @@ export async function getRequests(limit = 100): Promise<RequestPayload[]> { return parsed; } +/** + * Get full request payload data for a specific request (for detailed view) + */ +export async function getRequestPayload(requestId: string): Promise<RequestPayload | null> { + const dbOps = DatabaseFactory.getInstance(); + + const payload = withDatabaseRetrySync(() => { + return dbOps.getRequestPayload(requestId); + }, dbOps.getRetryConfig(), "getRequestPayload"); + + return payload as RequestPayload | null; +} + export async function getRequestSummaries( limit = 100, ): Promise<Array<RequestSummary>> { const dbOps = DatabaseFactory.getInstance(); - const db = dbOps.getDatabase(); - const summaries = db - .query(` - SELECT - id, - model, - input_tokens as inputTokens, - output_tokens as outputTokens, - total_tokens as totalTokens, - cache_read_input_tokens as cacheReadInputTokens, - cache_creation_input_tokens as cacheCreationInputTokens, - cost_usd as costUsd, - response_time_ms as responseTimeMs - FROM requests - ORDER BY timestamp DESC - LIMIT ? 
- `) - .all(limit) as Array<{ + // Use retry logic for the database query + const summaries = withDatabaseRetrySync(() => { + const db = dbOps.getDatabase(); + return db + .query(` + SELECT + id, + model, + input_tokens as inputTokens, + output_tokens as outputTokens, + total_tokens as totalTokens, + cache_read_input_tokens as cacheReadInputTokens, + cache_creation_input_tokens as cacheCreationInputTokens, + cost_usd as costUsd, + response_time_ms as responseTimeMs + FROM requests + ORDER BY timestamp DESC + LIMIT ? + `) + .all(limit); + }, dbOps.getRetryConfig(), "getRequestSummaries") as Array<{ id: string; model?: string; inputTokens?: number; diff --git a/scripts/diagnose-database.sh b/scripts/diagnose-database.sh new file mode 100644 index 00000000..2a607350 --- /dev/null +++ b/scripts/diagnose-database.sh @@ -0,0 +1,175 @@ +#!/bin/bash +# Database diagnostic script - READ-ONLY analysis +# Usage: kubectl exec -it <pod-name> -n coder -- /app/scripts/diagnose-database.sh + +set -e + +DB_PATH="/app/data/ccflare.db" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) + +echo "๐Ÿ” Database Diagnostic Report" +echo "Timestamp: $TIMESTAMP" +echo "Database path: $DB_PATH" +echo "========================================" +echo "" + +# File system analysis +echo "๐Ÿ“ FILE SYSTEM ANALYSIS:" +echo "------------------------" +if [ -f "$DB_PATH" ]; then + echo "โœ… Main database file exists" + ls -la "$DB_PATH" + echo "File type: $(file "$DB_PATH")" + echo "File size: $(du -h "$DB_PATH" | cut -f1)" +else + echo "โŒ Main database file missing: $DB_PATH" +fi + +if [ -f "$DB_PATH-wal" ]; then + echo "โœ… WAL file exists" + ls -la "$DB_PATH-wal" + echo "WAL size: $(du -h "$DB_PATH-wal" | cut -f1)" +else + echo "โ„น๏ธ No WAL file found" +fi + +if [ -f "$DB_PATH-shm" ]; then + echo "โœ… SHM file exists" + ls -la "$DB_PATH-shm" +else + echo "โ„น๏ธ No SHM file found" +fi + +echo "" + +# Database header analysis +echo "๐Ÿ”ฌ DATABASE HEADER ANALYSIS:" +echo "----------------------------" +if [ -f "$DB_PATH" ]; then + echo "First 100 bytes of database file:" + hexdump -C "$DB_PATH" | head -5 + echo "" + + # Check SQLite magic number + MAGIC=$(hexdump -C "$DB_PATH" | head -1 | cut -d' ' -f2-5) + if [[ "$MAGIC" == "53 51 4c 69" ]]; then + echo "โœ… SQLite magic number present (53 51 4c 69)" + else + echo "โŒ Invalid SQLite magic number: $MAGIC" + echo " Expected: 53 51 4c 69 (SQLi)" + fi +fi + +echo "" + +# SQLite integrity checks +echo "๐Ÿ” SQLITE INTEGRITY CHECKS:" +echo "---------------------------" +if [ -f "$DB_PATH" ]; then + echo "Testing database connectivity..."
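+ # "SELECT 1" is the cheapest possible probe: it fails immediately if the file is not a readable SQLite database.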
+ if sqlite3 "$DB_PATH" "SELECT 1;" 2>/dev/null >/dev/null; then + echo "โœ… Database is accessible" + + echo "" + echo "Journal mode:" + sqlite3 "$DB_PATH" "PRAGMA journal_mode;" 2>/dev/null || echo "โŒ Cannot read journal mode" + + echo "" + echo "Database schema version:" + sqlite3 "$DB_PATH" "PRAGMA schema_version;" 2>/dev/null || echo "โŒ Cannot read schema version" + + echo "" + echo "Page size:" + sqlite3 "$DB_PATH" "PRAGMA page_size;" 2>/dev/null || echo "โŒ Cannot read page size" + + echo "" + echo "Database size info:" + sqlite3 "$DB_PATH" "PRAGMA page_count; PRAGMA freelist_count;" 2>/dev/null || echo "โŒ Cannot read size info" + + echo "" + echo "Integrity check:" + INTEGRITY=$(sqlite3 "$DB_PATH" "PRAGMA integrity_check;" 2>/dev/null || echo "FAILED") + if [[ "$INTEGRITY" == "ok" ]]; then + echo "โœ… Database integrity: OK" + else + echo "โŒ Database integrity: $INTEGRITY" + fi + + echo "" + echo "Quick corruption check:" + sqlite3 "$DB_PATH" "PRAGMA quick_check;" 2>/dev/null || echo "โŒ Quick check failed" + + else + echo "โŒ Database is not accessible" + echo "Error details:" + sqlite3 "$DB_PATH" "SELECT 1;" 2>&1 || true + fi +fi + +echo "" + +# Table analysis +echo "๐Ÿ“Š TABLE ANALYSIS:" +echo "------------------" +if sqlite3 "$DB_PATH" "SELECT 1;" 2>/dev/null >/dev/null; then + echo "Database tables:" + sqlite3 "$DB_PATH" ".tables" 2>/dev/null || echo "โŒ Cannot list tables" + + echo "" + echo "Table row counts:" + for table in $(sqlite3 "$DB_PATH" ".tables" 2>/dev/null); do + count=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM $table;" 2>/dev/null || echo "ERROR") + echo " $table: $count rows" + done + + echo "" + echo "Recent requests (if accessible):" + sqlite3 "$DB_PATH" "SELECT id, timestamp, success FROM requests ORDER BY timestamp DESC LIMIT 5;" 2>/dev/null || echo "โŒ Cannot read requests table" +fi + +echo "" + +# WAL analysis +echo "๐Ÿ“ WAL FILE ANALYSIS:" +echo "---------------------" +if [ -f "$DB_PATH-wal" ]; then + echo "WAL file header:" + hexdump -C "$DB_PATH-wal" | head -3 + + echo "" + echo "WAL checkpoint status:" + sqlite3 "$DB_PATH" "PRAGMA wal_checkpoint;" 2>/dev/null || echo "โŒ WAL checkpoint failed" + + echo "" + echo "WAL autocheckpoint setting:" + sqlite3 "$DB_PATH" "PRAGMA wal_autocheckpoint;" 2>/dev/null || echo "โŒ Cannot read WAL autocheckpoint" +else + echo "โ„น๏ธ No WAL file to analyze" +fi + +echo "" + +# Recovery recommendations +echo "๐Ÿ’ก RECOVERY RECOMMENDATIONS:" +echo "----------------------------" +if sqlite3 "$DB_PATH" "PRAGMA integrity_check;" 2>/dev/null | grep -q "ok"; then + echo "โœ… Database appears healthy" + echo " - Try restarting the application" + echo " - Check for file locking issues" + echo " - Verify file permissions" +else + echo "โŒ Database corruption detected" + echo "" + echo "Safe recovery steps to try:" + echo "1. WAL checkpoint: sqlite3 $DB_PATH 'PRAGMA wal_checkpoint(FULL);'" + echo "2. Vacuum: sqlite3 $DB_PATH 'VACUUM;'" + echo "3. Dump data: sqlite3 $DB_PATH '.dump' > /app/data/backups/dump_$TIMESTAMP.sql" + echo "4. 
Recovery mode: sqlite3 $DB_PATH '.recover' > /app/data/backups/recover_$TIMESTAMP.sql" + echo "" + echo "โš ๏ธ DO NOT delete database files without manual review" +fi + +echo "" +echo "========================================" +echo "๐Ÿ” Diagnostic complete: $TIMESTAMP" +echo "๐Ÿ“ Save this output for analysis" diff --git a/scripts/fix-database-corruption.sh b/scripts/fix-database-corruption.sh new file mode 100644 index 00000000..0fa0fc48 --- /dev/null +++ b/scripts/fix-database-corruption.sh @@ -0,0 +1,129 @@ +#!/bin/bash +# Emergency database corruption fix script for Kubernetes pods + +set -e + +# Detect environment (pod vs traditional) +if [ -f /.dockerenv ] || [ -n "$KUBERNETES_SERVICE_HOST" ]; then + echo "๐Ÿณ Detected containerized environment" + DB_PATH="${1:-/app/data/ccflare.db}" + BACKUP_DIR="/app/data/backups" + IS_CONTAINER=true +else + echo "๐Ÿ–ฅ๏ธ Detected traditional environment" + DB_PATH="${1:-/opt/ccflare/data/ccflare.db}" + BACKUP_DIR="/opt/ccflare/data/backups" + IS_CONTAINER=false +fi + +TIMESTAMP=$(date +%Y%m%d_%H%M%S) + +echo "๐Ÿšจ Emergency Database Corruption Fix" +echo "Database path: $DB_PATH" +echo "Backup directory: $BACKUP_DIR" +echo "Timestamp: $TIMESTAMP" +echo "Container mode: $IS_CONTAINER" + +# Create backup directory +mkdir -p "$BACKUP_DIR" + +# Stop the service (different methods for container vs traditional) +if [ "$IS_CONTAINER" = "true" ]; then + echo "๐Ÿ“› Container mode: Cannot stop service, manual intervention required" + echo " Please scale down the deployment or kill the main process" + echo " kubectl scale deployment ccflare --replicas=0 -n coder" + echo " Then run this script and scale back up" +else + echo "๐Ÿ“› Stopping ccflare service..." + systemctl stop ccflare || echo "Service not running or not systemd" +fi + +# Backup corrupted files +echo "๐Ÿ’พ Backing up corrupted database files..." +if [ -f "$DB_PATH" ]; then + cp "$DB_PATH" "$BACKUP_DIR/ccflare.db.corrupted.$TIMESTAMP" +fi +if [ -f "$DB_PATH-wal" ]; then + cp "$DB_PATH-wal" "$BACKUP_DIR/ccflare.db-wal.corrupted.$TIMESTAMP" +fi +if [ -f "$DB_PATH-shm" ]; then + cp "$DB_PATH-shm" "$BACKUP_DIR/ccflare.db-shm.corrupted.$TIMESTAMP" +fi + +# Try to recover using WAL file +echo "๐Ÿ”ง Attempting WAL recovery..." +if [ -f "$DB_PATH-wal" ] && [ -s "$DB_PATH-wal" ]; then + echo "WAL file exists and has data, attempting recovery..." + + # Try to checkpoint the WAL file + sqlite3 "$DB_PATH" "PRAGMA wal_checkpoint(FULL);" 2>/dev/null || { + echo "โŒ WAL checkpoint failed, database is severely corrupted" + + # Try to dump and restore from WAL + echo "๐Ÿ”„ Attempting dump/restore recovery..." + sqlite3 "$DB_PATH" ".dump" > "$BACKUP_DIR/recovery_dump.$TIMESTAMP.sql" 2>/dev/null || { + echo "โŒ Cannot dump database, creating fresh database" + + # Remove corrupted files + rm -f "$DB_PATH" "$DB_PATH-wal" "$DB_PATH-shm" + + # Create fresh database (will be initialized by application) + echo "๐Ÿ†• Creating fresh database (data will be lost)" + touch "$DB_PATH" + } + + if [ -f "$BACKUP_DIR/recovery_dump.$TIMESTAMP.sql" ] && [ -s "$BACKUP_DIR/recovery_dump.$TIMESTAMP.sql" ]; then + echo "โœ… Dump successful, restoring database..." 
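+ # Replay the dump into a brand-new file: remove the damaged database and its WAL/SHM first so the restore starts clean.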
+ rm -f "$DB_PATH" "$DB_PATH-wal" "$DB_PATH-shm" + sqlite3 "$DB_PATH" < "$BACKUP_DIR/recovery_dump.$TIMESTAMP.sql" + echo "โœ… Database restored from dump" + fi + } +else + echo "โŒ No WAL file or empty WAL file, cannot recover" + rm -f "$DB_PATH" "$DB_PATH-wal" "$DB_PATH-shm" + echo "๐Ÿ†• Creating fresh database (data will be lost)" + touch "$DB_PATH" +fi + +# Verify database integrity +echo "๐Ÿ” Verifying database integrity..." +if sqlite3 "$DB_PATH" "PRAGMA integrity_check;" | grep -q "ok"; then + echo "โœ… Database integrity check passed" +else + echo "โŒ Database integrity check failed, recreating..." + rm -f "$DB_PATH" "$DB_PATH-wal" "$DB_PATH-shm" + touch "$DB_PATH" +fi + +# Set proper permissions +if [ "$IS_CONTAINER" = "true" ]; then + # In container, we're already running as ccflare user + chmod 664 "$DB_PATH" 2>/dev/null || echo "Could not set permissions" +else + chown ccflare:ccflare "$DB_PATH" 2>/dev/null || echo "Could not set ownership" + chmod 664 "$DB_PATH" 2>/dev/null || echo "Could not set permissions" +fi + +# Start the service (different methods for container vs traditional) +if [ "$IS_CONTAINER" = "true" ]; then + echo "๐Ÿ”„ Container mode: Manual restart required" + echo " Scale the deployment back up:" + echo " kubectl scale deployment ccflare --replicas=1 -n coder" + echo " Or restart the pod:" + echo " kubectl delete pod -l app=ccflare -n coder" +else + echo "๐Ÿ”„ Starting ccflare service..." + systemctl start ccflare || echo "Could not start service via systemctl" +fi + +echo "โœ… Database corruption fix completed" +echo "๐Ÿ“ Backup files saved in: $BACKUP_DIR" + +if [ "$IS_CONTAINER" = "true" ]; then + echo "๐Ÿ“Š Check pod status: kubectl get pods -l app=ccflare -n coder" + echo "๐Ÿ“‹ Check logs: kubectl logs -l app=ccflare -n coder -f" +else + echo "๐Ÿ“Š Check service status: systemctl status ccflare" + echo "๐Ÿ“‹ Check logs: journalctl -u ccflare -f" +fi diff --git a/scripts/manual-recovery.sh b/scripts/manual-recovery.sh new file mode 100644 index 00000000..019fbfd5 --- /dev/null +++ b/scripts/manual-recovery.sh @@ -0,0 +1,189 @@ +#!/bin/bash +# Manual database recovery script with confirmation prompts +# Usage: kubectl exec -it <pod-name> -n coder -- /app/scripts/manual-recovery.sh + +set -e + +DB_PATH="/app/data/ccflare.db" +BACKUP_DIR="/app/data/backups" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) + +echo "๐Ÿ”ง Manual Database Recovery Assistant" +echo "Database: $DB_PATH" +echo "Timestamp: $TIMESTAMP" +echo "" + +# Create backup directory +mkdir -p "$BACKUP_DIR" + +# Function to ask for confirmation +confirm() { + echo -n "$1 (y/N): " + read -r response + case "$response" in + [yY][eE][sS]|[yY]) + return 0 + ;; + *) + return 1 + ;; + esac +} + +# Step 1: Backup current state +echo "STEP 1: Backup current database state" +echo "======================================" +if confirm "Create backup of current database files?"; then + if [ -f "$DB_PATH" ]; then + cp "$DB_PATH" "$BACKUP_DIR/ccflare.db.backup.$TIMESTAMP" + echo "โœ… Backed up main database" + fi + if [ -f "$DB_PATH-wal" ]; then + cp "$DB_PATH-wal" "$BACKUP_DIR/ccflare.db-wal.backup.$TIMESTAMP" + echo "โœ… Backed up WAL file" + fi + if [ -f "$DB_PATH-shm" ]; then + cp "$DB_PATH-shm" "$BACKUP_DIR/ccflare.db-shm.backup.$TIMESTAMP" + echo "โœ… Backed up SHM file" + fi + echo "๐Ÿ“ Backups saved in: $BACKUP_DIR" +else + echo "โš ๏ธ Skipping backup - proceeding without safety net" +fi + +echo "" + +# Step 2: Integrity check +echo "STEP 2: Database integrity check" +echo "================================" +if sqlite3 
"$DB_PATH" "PRAGMA integrity_check;" 2>/dev/null | grep -q "ok"; then + echo "โœ… Database integrity: OK" + echo " The database may not be corrupted. Check for:" + echo " - File locking issues" + echo " - Permission problems" + echo " - Concurrent access" + exit 0 +else + echo "โŒ Database integrity check failed" + echo " Corruption detected - proceeding with recovery" +fi + +echo "" + +# Step 3: WAL checkpoint +echo "STEP 3: WAL checkpoint recovery" +echo "===============================" +if [ -f "$DB_PATH-wal" ] && [ -s "$DB_PATH-wal" ]; then + echo "WAL file found with data" + if confirm "Attempt WAL checkpoint to recover recent transactions?"; then + if sqlite3 "$DB_PATH" "PRAGMA wal_checkpoint(FULL);" 2>/dev/null; then + echo "โœ… WAL checkpoint successful" + + # Check if this fixed the corruption + if sqlite3 "$DB_PATH" "PRAGMA integrity_check;" 2>/dev/null | grep -q "ok"; then + echo "๐ŸŽ‰ Database recovered via WAL checkpoint!" + echo " Cleaning up WAL files..." + rm -f "$DB_PATH-wal" "$DB_PATH-shm" + echo "โœ… Recovery complete" + exit 0 + else + echo "โŒ WAL checkpoint didn't fix corruption" + fi + else + echo "โŒ WAL checkpoint failed" + fi + else + echo "โญ๏ธ Skipping WAL checkpoint" + fi +else + echo "โ„น๏ธ No WAL file or empty WAL file" +fi + +echo "" + +# Step 4: Database dump +echo "STEP 4: Database dump recovery" +echo "=============================" +if confirm "Attempt to dump readable data from database?"; then + DUMP_FILE="$BACKUP_DIR/recovery_dump.$TIMESTAMP.sql" + echo "Dumping database to: $DUMP_FILE" + + if sqlite3 "$DB_PATH" ".dump" > "$DUMP_FILE" 2>/dev/null && [ -s "$DUMP_FILE" ]; then + echo "โœ… Database dump successful" + echo " Dump size: $(du -h "$DUMP_FILE" | cut -f1)" + + if confirm "Create new database from dump? (REPLACES CURRENT DATABASE)"; then + echo "โš ๏ธ Creating new database from dump..." + + # Move corrupted files + mv "$DB_PATH" "$BACKUP_DIR/ccflare.db.corrupted.$TIMESTAMP" 2>/dev/null || true + mv "$DB_PATH-wal" "$BACKUP_DIR/ccflare.db-wal.corrupted.$TIMESTAMP" 2>/dev/null || true + mv "$DB_PATH-shm" "$BACKUP_DIR/ccflare.db-shm.corrupted.$TIMESTAMP" 2>/dev/null || true + + # Restore from dump + if sqlite3 "$DB_PATH" < "$DUMP_FILE" 2>/dev/null; then + echo "โœ… Database restored from dump" + + # Verify restored database + if sqlite3 "$DB_PATH" "PRAGMA integrity_check;" 2>/dev/null | grep -q "ok"; then + echo "๐ŸŽ‰ Database recovery successful!" + echo " Restored database passes integrity check" + exit 0 + else + echo "โŒ Restored database failed integrity check" + echo " Manual intervention required" + fi + else + echo "โŒ Failed to restore database from dump" + fi + else + echo "โญ๏ธ Dump created but not applied" + echo " Manual restore: sqlite3 $DB_PATH < $DUMP_FILE" + fi + else + echo "โŒ Database dump failed" + fi +else + echo "โญ๏ธ Skipping database dump" +fi + +echo "" + +# Step 5: Advanced recovery +echo "STEP 5: Advanced recovery options" +echo "=================================" +echo "Manual recovery commands to try:" +echo "" +echo "1. SQLite recovery mode:" +echo " sqlite3 $DB_PATH '.recover' > $BACKUP_DIR/recover_$TIMESTAMP.sql" +echo "" +echo "2. Partial dump (skip errors):" +echo " sqlite3 $DB_PATH '.dump' | grep -v '^ROLLBACK' > $BACKUP_DIR/partial_$TIMESTAMP.sql" +echo "" +echo "3. Change journal mode:" +echo " sqlite3 $DB_PATH 'PRAGMA journal_mode=DELETE; VACUUM;'" +echo "" +echo "4. 
Examine specific tables:" +echo " sqlite3 $DB_PATH 'SELECT COUNT(*) FROM requests;'" +echo " sqlite3 $DB_PATH 'SELECT * FROM requests LIMIT 10;'" +echo "" + +if confirm "Run SQLite recovery mode (.recover)?"; then + RECOVER_FILE="$BACKUP_DIR/recover_$TIMESTAMP.sql" + echo "Running recovery mode..." + if sqlite3 "$DB_PATH" ".recover" > "$RECOVER_FILE" 2>/dev/null; then + echo "โœ… Recovery mode completed" + echo " Output: $RECOVER_FILE" + echo " Size: $(du -h "$RECOVER_FILE" | cut -f1)" + else + echo "โŒ Recovery mode failed" + fi +fi + +echo "" +echo "๐Ÿ”ง Manual recovery session complete" +echo "๐Ÿ“ All files saved in: $BACKUP_DIR" +echo "โš ๏ธ If recovery failed, consider:" +echo " - Restoring from external backups" +echo " - Contacting database administrator" +echo " - Creating fresh database (DATA LOSS)" diff --git a/scripts/pod-db-repair.sh b/scripts/pod-db-repair.sh new file mode 100644 index 00000000..1897c586 --- /dev/null +++ b/scripts/pod-db-repair.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# Emergency database repair script for running inside Kubernetes pod +# Usage: kubectl exec -it <pod-name> -n coder -- /app/scripts/pod-db-repair.sh + +set -e + +DB_PATH="/app/data/ccflare.db" +BACKUP_DIR="/app/data/backups" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) + +echo "๐Ÿšจ Pod Database Emergency Repair" +echo "Database path: $DB_PATH" +echo "Timestamp: $TIMESTAMP" +echo "" + +# Create backup directory +mkdir -p "$BACKUP_DIR" + +# Check if database files exist +if [ ! -f "$DB_PATH" ]; then + echo "โŒ Database file not found: $DB_PATH" + echo "Creating empty database file..." + touch "$DB_PATH" + echo "โœ… Empty database created. Application will initialize schema on startup." + exit 0 +fi + +echo "๐Ÿ“Š Database file info:" +ls -la "$DB_PATH"* 2>/dev/null || echo "No database files found" +echo "" + +# Backup corrupted files +echo "๐Ÿ’พ Backing up database files..." +if [ -f "$DB_PATH" ]; then + cp "$DB_PATH" "$BACKUP_DIR/ccflare.db.corrupted.$TIMESTAMP" + echo "โœ… Backed up main database file" +fi +if [ -f "$DB_PATH-wal" ]; then + cp "$DB_PATH-wal" "$BACKUP_DIR/ccflare.db-wal.corrupted.$TIMESTAMP" + echo "โœ… Backed up WAL file" +fi +if [ -f "$DB_PATH-shm" ]; then + cp "$DB_PATH-shm" "$BACKUP_DIR/ccflare.db-shm.corrupted.$TIMESTAMP" + echo "โœ… Backed up SHM file" +fi + +# Check database integrity +echo "" +echo "๐Ÿ” Checking database integrity..." +if sqlite3 "$DB_PATH" "PRAGMA integrity_check;" 2>/dev/null | grep -q "ok"; then + echo "โœ… Database integrity check passed - database is not corrupted!" + echo "The SQLITE_NOTADB error might be due to file locking or permissions." + echo "Try restarting the pod: kubectl delete pod -l app=ccflare -n coder" + exit 0 +else + echo "โŒ Database integrity check failed - attempting repair..." +fi + +# Try WAL recovery first +echo "" +echo "๐Ÿ”ง Attempting WAL recovery..." +if [ -f "$DB_PATH-wal" ] && [ -s "$DB_PATH-wal" ]; then + echo "WAL file exists and has data, attempting checkpoint..." + + if sqlite3 "$DB_PATH" "PRAGMA wal_checkpoint(FULL);" 2>/dev/null; then + echo "โœ… WAL checkpoint successful" + + # Verify integrity after checkpoint + if sqlite3 "$DB_PATH" "PRAGMA integrity_check;" 2>/dev/null | grep -q "ok"; then + echo "โœ… Database repaired successfully via WAL checkpoint!" + rm -f "$DB_PATH-wal" "$DB_PATH-shm" 2>/dev/null + echo "๐Ÿงน Cleaned up WAL files" + exit 0 + fi + else + echo "โŒ WAL checkpoint failed" + fi +fi + +# Try dump and restore +echo "" +echo "๐Ÿ”„ Attempting dump and restore recovery..."
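+# .dump emits the schema plus every readable row as plain SQL, so replaying it into a fresh file rebuilds the database without carrying over corrupt pages.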
+DUMP_FILE="$BACKUP_DIR/recovery_dump.$TIMESTAMP.sql" + +if sqlite3 "$DB_PATH" ".dump" > "$DUMP_FILE" 2>/dev/null && [ -s "$DUMP_FILE" ]; then + echo "โœ… Database dump successful" + + # Create new database from dump + rm -f "$DB_PATH" "$DB_PATH-wal" "$DB_PATH-shm" + + if sqlite3 "$DB_PATH" < "$DUMP_FILE" 2>/dev/null; then + echo "โœ… Database restored from dump" + + # Verify restored database + if sqlite3 "$DB_PATH" "PRAGMA integrity_check;" 2>/dev/null | grep -q "ok"; then + echo "โœ… Restored database integrity verified!" + exit 0 + else + echo "โŒ Restored database failed integrity check" + fi + else + echo "โŒ Failed to restore database from dump" + fi +else + echo "โŒ Failed to dump database" +fi + +# Manual intervention required +echo "" +echo "โŒ Automatic recovery failed - manual intervention required" +echo "" +echo "๐Ÿ” DIAGNOSIS COMPLETE:" +echo " - Database integrity check failed" +echo " - WAL checkpoint failed or no WAL file" +echo " - Dump and restore failed" +echo "" +echo "๐Ÿ“‹ MANUAL RECOVERY OPTIONS:" +echo "" +echo "1. ๐Ÿ”ง Try advanced SQLite recovery tools:" +echo " sqlite3 $DB_PATH '.recover' > $BACKUP_DIR/recovered_data.$TIMESTAMP.sql" +echo " sqlite3 $DB_PATH '.dump' | grep -v '^ROLLBACK' > $BACKUP_DIR/partial_dump.$TIMESTAMP.sql" +echo "" +echo "2. ๐Ÿ” Examine database structure:" +echo " sqlite3 $DB_PATH '.schema'" +echo " sqlite3 $DB_PATH 'PRAGMA table_info(requests);'" +echo " sqlite3 $DB_PATH 'SELECT COUNT(*) FROM requests;'" +echo "" +echo "3. ๐Ÿ“Š Check file system issues:" +echo " ls -la $DB_PATH*" +echo " file $DB_PATH" +echo " hexdump -C $DB_PATH | head -5" +echo "" +echo "4. ๐Ÿ”„ Try different journal modes:" +echo " sqlite3 $DB_PATH 'PRAGMA journal_mode=DELETE; VACUUM;'" +echo " sqlite3 $DB_PATH 'PRAGMA journal_mode=WAL;'" +echo "" +echo "โš ๏ธ DO NOT DELETE DATABASE FILES WITHOUT MANUAL REVIEW" +echo "๐Ÿ“ All backups saved in: $BACKUP_DIR" +echo "" +echo "๐Ÿ†˜ If all else fails, contact database administrator" +echo " Consider restoring from external backups if available"