18 changes: 3 additions & 15 deletions apps/sim/app/api/files/upload/route.ts
@@ -9,23 +9,22 @@ import {
InvalidRequestError,
} from '@/app/api/files/utils'

// Allowlist of permitted file extensions for security
const ALLOWED_EXTENSIONS = new Set([
// Documents
'pdf',
'doc',
'docx',
'txt',
'md',
// Images (safe formats)
'png',
'jpg',
'jpeg',
'gif',
// Data files
'csv',
'xlsx',
'xls',
'json',
'yaml',
'yml',
])

/**
@@ -50,19 +49,16 @@ export async function POST(request: NextRequest) {

const formData = await request.formData()

- // Check if multiple files are being uploaded or a single file
const files = formData.getAll('file') as File[]

if (!files || files.length === 0) {
throw new InvalidRequestError('No files provided')
}

- // Get optional scoping parameters for execution-scoped storage
const workflowId = formData.get('workflowId') as string | null
const executionId = formData.get('executionId') as string | null
const workspaceId = formData.get('workspaceId') as string | null

- // Log storage mode
const usingCloudStorage = isUsingCloudStorage()
logger.info(`Using storage mode: ${usingCloudStorage ? 'Cloud' : 'Local'} for file upload`)

@@ -74,7 +70,6 @@ export async function POST(request: NextRequest) {

const uploadResults = []

- // Process each file
for (const file of files) {
const originalName = file.name

@@ -88,9 +83,7 @@
const bytes = await file.arrayBuffer()
const buffer = Buffer.from(bytes)

- // For execution-scoped files, use the dedicated execution file storage
if (workflowId && executionId) {
- // Use the dedicated execution file storage system
const { uploadExecutionFile } = await import('@/lib/workflows/execution-file-storage')
const userFile = await uploadExecutionFile(
{
@@ -107,13 +100,10 @@
continue
}

- // Upload to cloud or local storage using the standard uploadFile function
try {
logger.info(`Uploading file: ${originalName}`)
const result = await uploadFile(buffer, originalName, file.type, file.size)

- // Generate a presigned URL for cloud storage with appropriate expiry
- // Regular files get 24 hours (execution files are handled above)
let presignedUrl: string | undefined
if (usingCloudStorage) {
try {
@@ -144,7 +134,6 @@ export async function POST(request: NextRequest) {
}
}

- // Return all file information
if (uploadResults.length === 1) {
return NextResponse.json(uploadResults[0])
}
@@ -155,7 +144,6 @@
}
}

- // Handle preflight requests
export async function OPTIONS() {
return createOptionsResponse()
}
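
For readers of this route, a minimal client-side sketch of how the endpoint is exercised after the change. Only the form field names (`file`, `workflowId`, `executionId`, `workspaceId`) come from the handler above; the `/api/files/upload` URL (inferred from the Next.js app-router path) and the response handling are assumptions, not part of this PR.

```ts
// Hypothetical usage sketch — endpoint URL and response shape are assumptions;
// the field names match the formData reads in the handler above.
async function uploadFiles(files: File[], workflowId?: string, executionId?: string) {
  const form = new FormData()
  for (const file of files) form.append('file', file) // getAll('file') accepts multiple entries
  if (workflowId) form.append('workflowId', workflowId) // together with executionId, this routes
  if (executionId) form.append('executionId', executionId) // the upload to execution-scoped storage

  const res = await fetch('/api/files/upload', { method: 'POST', body: form })
  if (!res.ok) throw new Error(`Upload failed: ${res.status}`)
  return res.json() // a single upload result object when one file is sent
}
```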
1 change: 1 addition & 0 deletions apps/sim/app/api/knowledge/search/utils.test.ts
@@ -32,6 +32,7 @@ vi.stubGlobal(

vi.mock('@/lib/env', () => ({
env: {},
+ getEnv: (key: string) => process.env[key],
isTruthy: (value: string | boolean | number | undefined) =>
typeof value === 'string' ? value === 'true' || value === '1' : Boolean(value),
}))
1 change: 1 addition & 0 deletions apps/sim/app/api/knowledge/utils.test.ts
@@ -17,6 +17,7 @@ vi.mock('drizzle-orm', () => ({

vi.mock('@/lib/env', () => ({
env: { OPENAI_API_KEY: 'test-key' },
+ getEnv: (key: string) => process.env[key],
isTruthy: (value: string | boolean | number | undefined) =>
typeof value === 'string' ? value === 'true' || value === '1' : Boolean(value),
}))
57 changes: 41 additions & 16 deletions apps/sim/lib/chunkers/json-yaml-chunker.ts
@@ -1,18 +1,29 @@
import * as yaml from 'js-yaml'
import { createLogger } from '@/lib/logs/console/logger'
+ import { getAccurateTokenCount } from '@/lib/tokenization'
import { estimateTokenCount } from '@/lib/tokenization/estimators'
import type { Chunk, ChunkerOptions } from './types'

const logger = createLogger('JsonYamlChunker')

function getTokenCount(text: string): number {
- const estimate = estimateTokenCount(text)
- return estimate.count
+ try {
+ return getAccurateTokenCount(text, 'text-embedding-3-small')
+ } catch (error) {
+ logger.warn('Tiktoken failed, falling back to estimation')
+ const estimate = estimateTokenCount(text)
+ return estimate.count
+ }
}

/**
* Configuration for JSON/YAML chunking
* Reduced limits to ensure we stay well under OpenAI's 8,191 token limit per embedding request
*/
const JSON_YAML_CHUNKING_CONFIG = {
- TARGET_CHUNK_SIZE: 2000, // Target tokens per chunk
+ TARGET_CHUNK_SIZE: 1000, // Target tokens per chunk
MIN_CHUNK_SIZE: 100, // Minimum tokens per chunk
- MAX_CHUNK_SIZE: 3000, // Maximum tokens per chunk
+ MAX_CHUNK_SIZE: 1500, // Maximum tokens per chunk
MAX_DEPTH_FOR_SPLITTING: 5, // Maximum depth to traverse for splitting
}

@@ -34,7 +45,6 @@ export class JsonYamlChunker {
return true
} catch {
try {
- const yaml = require('js-yaml')
yaml.load(content)
return true
} catch {
@@ -48,9 +58,26 @@
*/
async chunk(content: string): Promise<Chunk[]> {
try {
- const data = JSON.parse(content)
- return this.chunkStructuredData(data)
+ let data: any
+ try {
+ data = JSON.parse(content)
+ } catch {
+ data = yaml.load(content)
+ }
+ const chunks = this.chunkStructuredData(data)
+
+ const tokenCounts = chunks.map((c) => c.tokenCount)
+ const totalTokens = tokenCounts.reduce((a, b) => a + b, 0)
+ const maxTokens = Math.max(...tokenCounts)
+ const avgTokens = Math.round(totalTokens / chunks.length)
+
+ logger.info(
+ `JSON chunking complete: ${chunks.length} chunks, ${totalTokens} total tokens (avg: ${avgTokens}, max: ${maxTokens})`
+ )
+
+ return chunks
} catch (error) {
+ logger.info('JSON parsing failed, falling back to text chunking')
return this.chunkAsText(content)
}
}
@@ -102,7 +129,6 @@ export class JsonYamlChunker {
const itemTokens = getTokenCount(itemStr)

if (itemTokens > this.chunkSize) {
- // Save current batch if it has items
if (currentBatch.length > 0) {
const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2)
chunks.push({
@@ -134,7 +160,7 @@
const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2)
chunks.push({
text: batchContent,
- tokenCount: currentTokens,
+ tokenCount: getTokenCount(batchContent),
metadata: {
startIndex: i - currentBatch.length,
endIndex: i - 1,
@@ -152,7 +178,7 @@
const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2)
chunks.push({
text: batchContent,
- tokenCount: currentTokens,
+ tokenCount: getTokenCount(batchContent),
metadata: {
startIndex: arr.length - currentBatch.length,
endIndex: arr.length - 1,
@@ -194,12 +220,11 @@
const valueTokens = getTokenCount(valueStr)

if (valueTokens > this.chunkSize) {
- // Save current object if it has properties
if (Object.keys(currentObj).length > 0) {
const objContent = JSON.stringify(currentObj, null, 2)
chunks.push({
text: objContent,
- tokenCount: currentTokens,
+ tokenCount: getTokenCount(objContent),
metadata: {
startIndex: 0,
endIndex: objContent.length,
@@ -230,7 +255,7 @@
const objContent = JSON.stringify(currentObj, null, 2)
chunks.push({
text: objContent,
- tokenCount: currentTokens,
+ tokenCount: getTokenCount(objContent),
metadata: {
startIndex: 0,
endIndex: objContent.length,
@@ -250,7 +275,7 @@
const objContent = JSON.stringify(currentObj, null, 2)
chunks.push({
text: objContent,
- tokenCount: currentTokens,
+ tokenCount: getTokenCount(objContent),
metadata: {
startIndex: 0,
endIndex: objContent.length,
@@ -262,7 +287,7 @@
}

/**
- * Fall back to text chunking if JSON parsing fails.
+ * Fall back to text chunking if JSON parsing fails
*/
private async chunkAsText(content: string): Promise<Chunk[]> {
const chunks: Chunk[] = []
@@ -308,7 +333,7 @@ export class JsonYamlChunker {
}

/**
- * Static method for chunking JSON/YAML data with default options.
+ * Static method for chunking JSON/YAML data with default options
*/
static async chunkJsonYaml(content: string, options: ChunkerOptions = {}): Promise<Chunk[]> {
const chunker = new JsonYamlChunker(options)
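
To make the effect of the tighter limits concrete, here is a hedged usage sketch. It assumes the class is importable via the same `@/lib` alias used elsewhere in this PR; the payload and the check are illustrative only, and the 1500 threshold mirrors the new `MAX_CHUNK_SIZE`.

```ts
// Illustrative only — checks that each chunk's recomputed tokenCount (now measured
// on the serialized chunk text, via tiktoken with an estimation fallback) stays
// near the new MAX_CHUNK_SIZE of 1500 tokens.
import { JsonYamlChunker } from '@/lib/chunkers/json-yaml-chunker'

async function inspectChunkSizes() {
  const bigDoc = JSON.stringify({
    users: Array.from({ length: 5000 }, (_, i) => ({ id: i, name: `user-${i}` })),
  })

  const chunks = await JsonYamlChunker.chunkJsonYaml(bigDoc)
  for (const chunk of chunks) {
    if (chunk.tokenCount > 1500) {
      console.warn(`Oversized chunk: ${chunk.tokenCount} tokens`)
    }
  }
}
```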
44 changes: 33 additions & 11 deletions apps/sim/lib/embeddings/utils.ts
@@ -1,9 +1,12 @@
import { env } from '@/lib/env'
import { isRetryableError, retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils'
import { createLogger } from '@/lib/logs/console/logger'
+ import { batchByTokenLimit, getTotalTokenCount } from '@/lib/tokenization'

const logger = createLogger('EmbeddingUtils')

+ const MAX_TOKENS_PER_REQUEST = 8000

export class EmbeddingAPIError extends Error {
public status: number

@@ -104,35 +107,54 @@ async function callEmbeddingAPI(inputs: string[], config: EmbeddingConfig): Prom
}

/**
- * Generate embeddings for multiple texts with simple batching
+ * Generate embeddings for multiple texts with token-aware batching
+ * Uses tiktoken for token counting
*/
export async function generateEmbeddings(
texts: string[],
embeddingModel = 'text-embedding-3-small'
): Promise<number[][]> {
const config = getEmbeddingConfig(embeddingModel)

- logger.info(`Using ${config.useAzure ? 'Azure OpenAI' : 'OpenAI'} for embeddings generation`)
+ logger.info(
+ `Using ${config.useAzure ? 'Azure OpenAI' : 'OpenAI'} for embeddings generation (${texts.length} texts)`
+ )

+ const batches = batchByTokenLimit(texts, MAX_TOKENS_PER_REQUEST, embeddingModel)
+
+ logger.info(
+ `Split ${texts.length} texts into ${batches.length} batches (max ${MAX_TOKENS_PER_REQUEST} tokens per batch)`
+ )

- // Reduced batch size to prevent API timeouts and improve reliability
- const batchSize = 50 // Reduced from 100 to prevent issues with large documents
const allEmbeddings: number[][] = []

- for (let i = 0; i < texts.length; i += batchSize) {
- const batch = texts.slice(i, i + batchSize)
- const batchEmbeddings = await callEmbeddingAPI(batch, config)
- allEmbeddings.push(...batchEmbeddings)
+ for (let i = 0; i < batches.length; i++) {
+ const batch = batches[i]
+ const batchTokenCount = getTotalTokenCount(batch, embeddingModel)

logger.info(
- `Generated embeddings for batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(texts.length / batchSize)}`
+ `Processing batch ${i + 1}/${batches.length}: ${batch.length} texts, ${batchTokenCount} tokens`
)

- // Add small delay between batches to avoid rate limiting
- if (i + batchSize < texts.length) {
+ try {
+ const batchEmbeddings = await callEmbeddingAPI(batch, config)
+ allEmbeddings.push(...batchEmbeddings)
+
+ logger.info(
+ `Generated ${batchEmbeddings.length} embeddings for batch ${i + 1}/${batches.length}`
+ )
+ } catch (error) {
+ logger.error(`Failed to generate embeddings for batch ${i + 1}:`, error)
+ throw error
+ }
+
+ if (i + 1 < batches.length) {
await new Promise((resolve) => setTimeout(resolve, 100))
}
}

+ logger.info(`Successfully generated ${allEmbeddings.length} embeddings total`)

return allEmbeddings
}
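
`batchByTokenLimit` itself is not shown in this PR; below is a minimal sketch of the greedy, token-aware batching it presumably performs. The function name suffix, the injected token counter, and the packing strategy are assumptions — the real helper in `@/lib/tokenization` may differ.

```ts
// Sketch of greedy token-aware batching (not the library implementation).
// countTokens stands in for whatever per-text counter the tokenization lib provides.
function batchByTokenLimitSketch(
  texts: string[],
  maxTokensPerBatch: number,
  countTokens: (text: string) => number
): string[][] {
  const batches: string[][] = []
  let current: string[] = []
  let currentTokens = 0

  for (const text of texts) {
    const tokens = countTokens(text)
    // Close the current batch when adding this text would exceed the cap;
    // a single text larger than the cap still gets its own batch.
    if (current.length > 0 && currentTokens + tokens > maxTokensPerBatch) {
      batches.push(current)
      current = []
      currentTokens = 0
    }
    current.push(text)
    currentTokens += tokens
  }
  if (current.length > 0) batches.push(current)
  return batches
}
```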
