From f67615610c54fe6ac8ff377285bc09aa81a0b0ff Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 26 Jan 2025 21:06:56 -0300 Subject: [PATCH 01/11] Reapply "Nick: extract api reference" This reverts commit 61d7ba76f76ce74e0d230f89a93436f29dc8d9df. --- apps/api/src/controllers/v1/extract-status.ts | 2 + apps/api/src/controllers/v1/types.ts | 4 + apps/api/src/lib/extract/extract-redis.ts | 3 + .../api/src/lib/extract/extraction-service.ts | 78 +++++++++++++++++++ apps/api/src/services/rate-limiter.ts | 1 + 5 files changed, 88 insertions(+) diff --git a/apps/api/src/controllers/v1/extract-status.ts b/apps/api/src/controllers/v1/extract-status.ts index 3e166e5fd3..da3c08f137 100644 --- a/apps/api/src/controllers/v1/extract-status.ts +++ b/apps/api/src/controllers/v1/extract-status.ts @@ -30,6 +30,7 @@ export async function extractStatusController( data = jobData[0].docs; } + console.log(extract.sources); return res.status(200).json({ success: extract.status === "failed" ? false : true, data: data, @@ -38,5 +39,6 @@ export async function extractStatusController( expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(), steps: extract.showSteps ? extract.steps : undefined, llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined, + sources: extract.sources, }); } diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 13b141168c..e86f209eab 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -537,6 +537,7 @@ export interface URLTrace { }; relevanceScore?: number; usedInCompletion?: boolean; + extractedFields?: string[]; } export interface ExtractResponse { @@ -547,6 +548,9 @@ export interface ExtractResponse { id?: string; warning?: string; urlTrace?: URLTrace[]; + sources?: { + [key: string]: string[]; + }; } export interface ExtractResponseRequestTest { diff --git a/apps/api/src/lib/extract/extract-redis.ts b/apps/api/src/lib/extract/extract-redis.ts index a6a3d6d3d5..39cf23640d 100644 --- a/apps/api/src/lib/extract/extract-redis.ts +++ b/apps/api/src/lib/extract/extract-redis.ts @@ -32,6 +32,9 @@ export type StoredExtract = { steps?: ExtractedStep[]; showLLMUsage?: boolean; llmUsage?: number; + sources?: { + [key: string]: string[]; + }; }; // Reduce TTL to 6 hours instead of 24 diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 0ac3024581..577daaa540 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -56,6 +56,9 @@ interface ExtractResult { tokenUsageBreakdown?: TokenUsage[]; llmUsage?: number; totalUrlsScraped?: number; + sources?: { + [key: string]: string[]; + }; } async function analyzeSchemaAndPrompt( @@ -179,6 +182,45 @@ function getRootDomain(url: string): string { } } +// Add helper function to track sources +function trackFieldSources(data: any, url: string, parentPath: string = ''): string[] { + const extractedFields: string[] = []; + + if (data && typeof data === 'object') { + Object.entries(data).forEach(([key, value]) => { + const currentPath = parentPath ? 
`${parentPath}.${key}` : key; + + if (value !== null && value !== undefined) { + extractedFields.push(currentPath); + + if (typeof value === 'object') { + extractedFields.push(...trackFieldSources(value, url, currentPath)); + } + } + }); + } + + return extractedFields; +} + +// Add helper to merge sources from multiple extractions +function mergeSources(sources: { [key: string]: string[] }[]): { [key: string]: string[] } { + const mergedSources: { [key: string]: string[] } = {}; + + sources.forEach(sourceMap => { + Object.entries(sourceMap).forEach(([field, urls]) => { + if (!mergedSources[field]) { + mergedSources[field] = []; + } + mergedSources[field].push(...urls); + // Deduplicate URLs + mergedSources[field] = [...new Set(mergedSources[field])]; + }); + }); + + return mergedSources; +} + export async function performExtraction( extractId: string, options: ExtractServiceOptions, @@ -191,6 +233,7 @@ export async function performExtraction( let multiEntityResult: any = {}; let singleAnswerResult: any = {}; let totalUrlsScraped = 0; + let extractionSources: { [key: string]: string[] } = {}; const logger = _logger.child({ module: "extract", @@ -551,6 +594,24 @@ export async function performExtraction( // return null; // } + if (multiEntityCompletion?.extract) { + const extractedFields = trackFieldSources(multiEntityCompletion.extract, doc.metadata.url || doc.metadata.sourceURL!); + + // Update URL trace with extracted fields + const trace = urlTraces.find(t => t.url === (doc.metadata.url || doc.metadata.sourceURL!)); + if (trace) { + trace.extractedFields = extractedFields; + } + + // Track sources for each field + extractedFields.forEach(field => { + if (!extractionSources[field]) { + extractionSources[field] = []; + } + extractionSources[field].push(doc.metadata.url || doc.metadata.sourceURL!); + }); + } + return multiEntityCompletion.extract; } catch (error) { logger.error(`Failed to process document.`, { error, url: doc.metadata.url ?? doc.metadata.sourceURL! }); @@ -727,6 +788,21 @@ export async function performExtraction( // } // }); // } + + if (singleAnswerCompletions?.extract) { + const singleAnswerSources: { [key: string]: string[] } = {}; + const usedUrls = Array.from(docsMap.values()) + .map(doc => doc.metadata.url || doc.metadata.sourceURL!) + .filter(Boolean); + + const extractedFields = trackFieldSources(singleAnswerCompletions.extract, ''); + extractedFields.forEach(field => { + singleAnswerSources[field] = usedUrls; + }); + + // Merge with multi-entity sources + extractionSources = mergeSources([extractionSources, singleAnswerSources]); + } } let finalResult = reqSchema @@ -817,6 +893,7 @@ export async function performExtraction( updateExtract(extractId, { status: "completed", llmUsage, + sources: extractionSources }).catch((error) => { logger.error( `Failed to update extract ${extractId} status to completed: ${error}`, @@ -834,5 +911,6 @@ export async function performExtraction( urlTrace: request.urlTrace ? 
urlTraces : undefined,
    llmUsage,
    totalUrlsScraped,
+    sources: extractionSources
  };
}
diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts
index 68cea8faa8..f54019e1b7 100644
--- a/apps/api/src/services/rate-limiter.ts
+++ b/apps/api/src/services/rate-limiter.ts
@@ -227,6 +227,7 @@ export function getRateLimiterPoints(
   const points: number =
     rateLimitConfig[makePlanKey(plan)] || rateLimitConfig.default; // 5
+
   return points;
 }

From 61a142c60ab6d98cf2d587d362148e2339f5f34a Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Sun, 26 Jan 2025 21:10:52 -0300
Subject: [PATCH 02/11] Nick: refactor analyzer

---
 apps/api/src/lib/extract/build-prompts.ts     | 34 ++++++
 .../completions/analyzeSchemaAndPrompt.ts     | 94 +++++++++++++++
 .../api/src/lib/extract/extraction-service.ts | 109 +----------------
 3 files changed, 130 insertions(+), 107 deletions(-)
 create mode 100644 apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts

diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts
index ac5da8d80f..4cb498571c 100644
--- a/apps/api/src/lib/extract/build-prompts.ts
+++ b/apps/api/src/lib/extract/build-prompts.ts
@@ -42,3 +42,37 @@ to determine their relevance to the user's query and intent.
 export function buildRerankerUserPrompt(searchQuery: string): string {
   return `Given these URLs, rank which ones are relevant to the user's extraction intent: "${searchQuery}".`;
 }
+
+// Multi entity schema analyzer
+
+export function buildAnalyzeSchemaPrompt(): string {
+  return `You are a query classifier for a web scraping system. Classify the data extraction query as either:
+A) Single-Answer: One answer across a few pages, possibly containing small arrays.
+B) Multi-Entity: Many items across many pages, often involving large arrays.
+
+Consider:
+1. Answer Cardinality: Single or multiple items?
+2. Page Distribution: Found on 1-3 pages or many?
+3. Verification Needs: Cross-page verification or independent extraction?
+
+Provide:
+- Method: [Single-Answer/Multi-Entity]
+- Confidence: [0-100%]
+- Reasoning: Why this classification?
+- Key Indicators: Specific aspects leading to this decision.
+
+Examples:
+- "Is this company a non-profit?" -> Single-Answer
+- "Extract all product prices" -> Multi-Entity
+
+For Single-Answer, arrays may be present but are typically small. For Multi-Entity, if arrays have multiple items not from a single page, return keys with large arrays. If nested, return the full key (e.g., 'ecommerce.products').`;
+}
+
+export function buildAnalyzeSchemaUserPrompt(
+  schemaString: string,
+  prompt: string,
+  urls: string[],
+): string {
+  return `Classify the query as Single-Answer or Multi-Entity. 
For Multi-Entity, return keys with large arrays; otherwise, return none: +Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`; +} diff --git a/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts new file mode 100644 index 0000000000..b44fc5cfbb --- /dev/null +++ b/apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts @@ -0,0 +1,94 @@ +import { generateSchemaFromPrompt } from "../../../scraper/scrapeURL/transformers/llmExtract"; +import { TokenUsage } from "../../../controllers/v1/types"; +import { z } from "zod"; +import { + buildAnalyzeSchemaPrompt, + buildAnalyzeSchemaUserPrompt, +} from "../build-prompts"; +import OpenAI from "openai"; +const openai = new OpenAI(); + +export async function analyzeSchemaAndPrompt( + urls: string[], + schema: any, + prompt: string, +): Promise<{ + isMultiEntity: boolean; + multiEntityKeys: string[]; + reasoning?: string; + keyIndicators?: string[]; + tokenUsage: TokenUsage; +}> { + if (!schema) { + schema = await generateSchemaFromPrompt(prompt); + } + + const schemaString = JSON.stringify(schema); + + const checkSchema = z + .object({ + isMultiEntity: z.boolean(), + multiEntityKeys: z.array(z.string()).optional().default([]), + reasoning: z.string(), + keyIndicators: z.array(z.string()), + }) + .refine( + (x) => !x.isMultiEntity || x.multiEntityKeys.length > 0, + "isMultiEntity was true, but no multiEntityKeys", + ); + + const model = "gpt-4o"; + + const result = await openai.beta.chat.completions.parse({ + model: model, + messages: [ + { + role: "system", + content: buildAnalyzeSchemaPrompt(), + }, + { + role: "user", + content: buildAnalyzeSchemaUserPrompt(schemaString, prompt, urls), + }, + ], + response_format: { + type: "json_schema", + json_schema: { + schema: { + type: "object", + properties: { + isMultiEntity: { type: "boolean" }, + multiEntityKeys: { type: "array", items: { type: "string" } }, + reasoning: { type: "string" }, + keyIndicators: { type: "array", items: { type: "string" } }, + }, + required: [ + "isMultiEntity", + "multiEntityKeys", + "reasoning", + "keyIndicators", + ], + additionalProperties: false, + }, + name: "checkSchema", + }, + }, + }); + + const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } = + checkSchema.parse(result.choices[0].message.parsed); + + const tokenUsage: TokenUsage = { + promptTokens: result.usage?.prompt_tokens ?? 0, + completionTokens: result.usage?.completion_tokens ?? 0, + totalTokens: result.usage?.total_tokens ?? 
0, + model: model, + }; + return { + isMultiEntity, + multiEntityKeys, + reasoning, + keyIndicators, + tokenUsage, + }; +} diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 577daaa540..987dfeb461 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -2,7 +2,6 @@ import { Document, ExtractRequest, TokenUsage, - toLegacyCrawlerOptions, URLTrace, } from "../../controllers/v1/types"; import { PlanType } from "../../types"; @@ -17,27 +16,23 @@ import { buildDocument } from "./build-document"; import { billTeam } from "../../services/billing/credit_billing"; import { logJob } from "../../services/logging/log_job"; import { _addScrapeJobToBullMQ } from "../../services/queue-jobs"; -import { saveCrawl, StoredCrawl } from "../crawl-redis"; import { dereferenceSchema } from "./helpers/dereference-schema"; -import { z } from "zod"; -import OpenAI from "openai"; import { spreadSchemas } from "./helpers/spread-schemas"; import { transformArrayToObject } from "./helpers/transform-array-to-obj"; import { mixSchemaObjects } from "./helpers/mix-schema-objs"; import Ajv from "ajv"; const ajv = new Ajv(); -const openai = new OpenAI(); import { ExtractStep, updateExtract } from "./extract-redis"; import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array"; import { mergeNullValObjs } from "./helpers/merge-null-val-objs"; -import { CUSTOM_U_TEAMS, extractConfig } from "./config"; +import { CUSTOM_U_TEAMS } from "./config"; import { calculateFinalResultCost, estimateCost, estimateTotalCost, } from "./usage/llm-cost"; -import { numTokensFromString } from "../LLM-extraction/helpers"; +import { analyzeSchemaAndPrompt } from "./completions/analyzeSchemaAndPrompt"; interface ExtractServiceOptions { request: ExtractRequest; @@ -61,107 +56,7 @@ interface ExtractResult { }; } -async function analyzeSchemaAndPrompt( - urls: string[], - schema: any, - prompt: string, -): Promise<{ - isMultiEntity: boolean; - multiEntityKeys: string[]; - reasoning?: string; - keyIndicators?: string[]; - tokenUsage: TokenUsage; -}> { - if (!schema) { - schema = await generateSchemaFromPrompt(prompt); - } - - const schemaString = JSON.stringify(schema); - - const checkSchema = z.object({ - isMultiEntity: z.boolean(), - multiEntityKeys: z.array(z.string()).optional().default([]), - reasoning: z.string(), - keyIndicators: z.array(z.string()), - }).refine(x => !x.isMultiEntity || x.multiEntityKeys.length > 0, "isMultiEntity was true, but no multiEntityKeys"); - - const model = "gpt-4o"; - - const result = await openai.beta.chat.completions.parse({ - model: model, - messages: [ - { - role: "system", - content: ` -You are a query classifier for a web scraping system. Classify the data extraction query as either: -A) Single-Answer: One answer across a few pages, possibly containing small arrays. -B) Multi-Entity: Many items across many pages, often involving large arrays. - -Consider: -1. Answer Cardinality: Single or multiple items? -2. Page Distribution: Found on 1-3 pages or many? -3. Verification Needs: Cross-page verification or independent extraction? - -Provide: -- Method: [Single-Answer/Multi-Entity] -- Confidence: [0-100%] -- Reasoning: Why this classification? -- Key Indicators: Specific aspects leading to this decision. - -Examples: -- "Is this company a non-profit?" 
-> Single-Answer -- "Extract all product prices" -> Multi-Entity - -For Single-Answer, arrays may be present but are typically small. For Multi-Entity, if arrays have multiple items not from a single page, return keys with large arrays. If nested, return the full key (e.g., 'ecommerce.products'). - `, - }, - { - role: "user", - content: `Classify the query as Single-Answer or Multi-Entity. For Multi-Entity, return keys with large arrays; otherwise, return none: -Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`, - }, - ], - response_format: { - type: "json_schema", - json_schema: { - schema: { - type: "object", - properties: { - isMultiEntity: { type: "boolean" }, - multiEntityKeys: { type: "array", items: { type: "string" } }, - reasoning: { type: "string" }, - keyIndicators: { type: "array", items: { type: "string" } }, - }, - required: [ - "isMultiEntity", - "multiEntityKeys", - "reasoning", - "keyIndicators", - ], - additionalProperties: false, - }, - name: "checkSchema", - }, - }, - }); - const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } = - checkSchema.parse(result.choices[0].message.parsed); - - const tokenUsage: TokenUsage = { - promptTokens: result.usage?.prompt_tokens ?? 0, - completionTokens: result.usage?.completion_tokens ?? 0, - totalTokens: result.usage?.total_tokens ?? 0, - model: model, - }; - return { - isMultiEntity, - multiEntityKeys, - reasoning, - keyIndicators, - tokenUsage, - }; -} type completions = { extract: Record; From 592ff4c7d437432973ab328adb7cece3c438af3d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 26 Jan 2025 21:11:03 -0300 Subject: [PATCH 03/11] Nick: formatting --- .../api/src/lib/extract/extraction-service.ts | 123 ++++++++++++------ 1 file changed, 86 insertions(+), 37 deletions(-) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 987dfeb461..37ff162726 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -56,8 +56,6 @@ interface ExtractResult { }; } - - type completions = { extract: Record; numTokens: number; @@ -78,31 +76,37 @@ function getRootDomain(url: string): string { } // Add helper function to track sources -function trackFieldSources(data: any, url: string, parentPath: string = ''): string[] { +function trackFieldSources( + data: any, + url: string, + parentPath: string = "", +): string[] { const extractedFields: string[] = []; - - if (data && typeof data === 'object') { + + if (data && typeof data === "object") { Object.entries(data).forEach(([key, value]) => { const currentPath = parentPath ? 
`${parentPath}.${key}` : key; - + if (value !== null && value !== undefined) { extractedFields.push(currentPath); - - if (typeof value === 'object') { + + if (typeof value === "object") { extractedFields.push(...trackFieldSources(value, url, currentPath)); } } }); } - + return extractedFields; } // Add helper to merge sources from multiple extractions -function mergeSources(sources: { [key: string]: string[] }[]): { [key: string]: string[] } { +function mergeSources(sources: { [key: string]: string[] }[]): { + [key: string]: string[]; +} { const mergedSources: { [key: string]: string[] } = {}; - - sources.forEach(sourceMap => { + + sources.forEach((sourceMap) => { Object.entries(sourceMap).forEach(([field, urls]) => { if (!mergedSources[field]) { mergedSources[field] = []; @@ -112,7 +116,7 @@ function mergeSources(sources: { [key: string]: string[] }[]): { [key: string]: mergedSources[field] = [...new Set(mergedSources[field])]; }); }); - + return mergedSources; } @@ -196,7 +200,7 @@ export async function performExtraction( if (links.length === 0) { logger.error("0 links! Bailing.", { - linkCount: links.length + linkCount: links.length, }); return { success: false, @@ -223,14 +227,20 @@ export async function performExtraction( let reqSchema = request.schema; if (!reqSchema && request.prompt) { reqSchema = await generateSchemaFromPrompt(request.prompt); - logger.debug("Generated request schema.", { originalSchema: request.schema, schema: reqSchema }); + logger.debug("Generated request schema.", { + originalSchema: request.schema, + schema: reqSchema, + }); } if (reqSchema) { reqSchema = await dereferenceSchema(reqSchema); } - logger.debug("Transformed schema.", { originalSchema: request.schema, schema: reqSchema }); + logger.debug("Transformed schema.", { + originalSchema: request.schema, + schema: reqSchema, + }); // agent evaluates if the schema or the prompt has an array with big amount of items // also it checks if the schema any other properties that are not arrays @@ -246,7 +256,12 @@ export async function performExtraction( tokenUsage: schemaAnalysisTokenUsage, } = await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? ""); - logger.debug("Analyzed schema.", { isMultiEntity, multiEntityKeys, reasoning, keyIndicators }); + logger.debug("Analyzed schema.", { + isMultiEntity, + multiEntityKeys, + reasoning, + keyIndicators, + }); // Track schema analysis tokens tokenUsage.push(schemaAnalysisTokenUsage); @@ -306,7 +321,12 @@ export async function performExtraction( timeout, }, urlTraces, - logger.child({ module: "extract", method: "scrapeDocument", url, isMultiEntity: true }), + logger.child({ + module: "extract", + method: "scrapeDocument", + url, + isMultiEntity: true, + }), ); } return docsMap.get(url); @@ -438,7 +458,8 @@ export async function performExtraction( (request.systemPrompt ? `${request.systemPrompt}\n` : "") + `Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. 
Here are the urls the user provided of which he wants to extract information from: ` + links.join(", "), - prompt: "Today is: " + new Date().toISOString() + "\n" + request.prompt, + prompt: + "Today is: " + new Date().toISOString() + "\n" + request.prompt, schema: multiEntitySchema, }, buildDocument(doc), @@ -490,26 +511,36 @@ export async function performExtraction( // } if (multiEntityCompletion?.extract) { - const extractedFields = trackFieldSources(multiEntityCompletion.extract, doc.metadata.url || doc.metadata.sourceURL!); - + const extractedFields = trackFieldSources( + multiEntityCompletion.extract, + doc.metadata.url || doc.metadata.sourceURL!, + ); + // Update URL trace with extracted fields - const trace = urlTraces.find(t => t.url === (doc.metadata.url || doc.metadata.sourceURL!)); + const trace = urlTraces.find( + (t) => t.url === (doc.metadata.url || doc.metadata.sourceURL!), + ); if (trace) { trace.extractedFields = extractedFields; } - + // Track sources for each field - extractedFields.forEach(field => { + extractedFields.forEach((field) => { if (!extractionSources[field]) { extractionSources[field] = []; } - extractionSources[field].push(doc.metadata.url || doc.metadata.sourceURL!); + extractionSources[field].push( + doc.metadata.url || doc.metadata.sourceURL!, + ); }); } return multiEntityCompletion.extract; } catch (error) { - logger.error(`Failed to process document.`, { error, url: doc.metadata.url ?? doc.metadata.sourceURL! }); + logger.error(`Failed to process document.`, { + error, + url: doc.metadata.url ?? doc.metadata.sourceURL!, + }); return null; } }); @@ -519,7 +550,9 @@ export async function performExtraction( multiEntityCompletions.push( ...chunkResults.filter((result) => result !== null), ); - logger.debug("All multi-entity completion chunks finished.", { completionCount: multiEntityCompletions.length }); + logger.debug("All multi-entity completion chunks finished.", { + completionCount: multiEntityCompletions.length, + }); } try { @@ -581,7 +614,12 @@ export async function performExtraction( timeout, }, urlTraces, - logger.child({ module: "extract", method: "scrapeDocument", url, isMultiEntity: false }) + logger.child({ + module: "extract", + method: "scrapeDocument", + url, + isMultiEntity: false, + }), ); } return docsMap.get(url); @@ -687,21 +725,32 @@ export async function performExtraction( if (singleAnswerCompletions?.extract) { const singleAnswerSources: { [key: string]: string[] } = {}; const usedUrls = Array.from(docsMap.values()) - .map(doc => doc.metadata.url || doc.metadata.sourceURL!) + .map((doc) => doc.metadata.url || doc.metadata.sourceURL!) .filter(Boolean); - - const extractedFields = trackFieldSources(singleAnswerCompletions.extract, ''); - extractedFields.forEach(field => { + + const extractedFields = trackFieldSources( + singleAnswerCompletions.extract, + "", + ); + extractedFields.forEach((field) => { singleAnswerSources[field] = usedUrls; }); - + // Merge with multi-entity sources - extractionSources = mergeSources([extractionSources, singleAnswerSources]); + extractionSources = mergeSources([ + extractionSources, + singleAnswerSources, + ]); } } let finalResult = reqSchema - ? await mixSchemaObjects(reqSchema, singleAnswerResult, multiEntityResult, logger.child({ method: "mixSchemaObjects" })) + ? 
await mixSchemaObjects(
+        reqSchema,
+        singleAnswerResult,
+        multiEntityResult,
+        logger.child({ method: "mixSchemaObjects" }),
+      )
     : singleAnswerResult || multiEntityResult;

   // Tokenize final result to get token count
@@ -788,7 +837,7 @@ export async function performExtraction(
     updateExtract(extractId, {
       status: "completed",
       llmUsage,
-      sources: extractionSources
+      sources: extractionSources,
     }).catch((error) => {
       logger.error(
         `Failed to update extract ${extractId} status to completed: ${error}`,
@@ -806,6 +855,6 @@ export async function performExtraction(
     urlTrace: request.urlTrace ? urlTraces : undefined,
     llmUsage,
     totalUrlsScraped,
-    sources: extractionSources
+    sources: extractionSources,
   };
 }

From 04e08bb137e837fab83c46655ea623d38b3d21dd Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Sun, 26 Jan 2025 21:33:33 -0300
Subject: [PATCH 04/11] Nick:

---
 apps/api/src/lib/extract/build-prompts.ts     | 31 +++++-
 .../lib/extract/completions/batchExtract.ts   | 47 +++++++++
 .../extract/completions/checkShouldExtract.ts | 40 ++++++++
 .../lib/extract/completions/singleAnswer.ts   | 34 +++++++
 .../api/src/lib/extract/extraction-service.ts | 95 +++++--------------
 5 files changed, 174 insertions(+), 73 deletions(-)
 create mode 100644 apps/api/src/lib/extract/completions/batchExtract.ts
 create mode 100644 apps/api/src/lib/extract/completions/checkShouldExtract.ts
 create mode 100644 apps/api/src/lib/extract/completions/singleAnswer.ts

diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts
index 4cb498571c..ebd6c7fea1 100644
--- a/apps/api/src/lib/extract/build-prompts.ts
+++ b/apps/api/src/lib/extract/build-prompts.ts
@@ -44,7 +44,6 @@ export function buildRerankerUserPrompt(searchQuery: string): string {
 }
 
 // Multi entity schema analyzer
-
 export function buildAnalyzeSchemaPrompt(): string {
   return `You are a query classifier for a web scraping system. Classify the data extraction query as either:
 A) Single-Answer: One answer across a few pages, possibly containing small arrays.
@@ -76,3 +75,33 @@ export function buildAnalyzeSchemaUserPrompt(
   return `Classify the query as Single-Answer or Multi-Entity. For Multi-Entity, return keys with large arrays; otherwise, return none:
 Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`;
 }
+
+// Should Extract
+
+export function buildShouldExtractSystemPrompt(): string {
+  return `You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.`;
+}
+
+export function buildShouldExtractUserPrompt(
+  prompt: string,
+  schema: any,
+): string {
+  return `Should the following content be used to extract information for this prompt: "${prompt}" User schema is: ${JSON.stringify(schema)}\nReturn only true or false.`;
+}
+
+// Batch extract
+export function buildBatchExtractSystemPrompt(
+  systemPrompt: string,
+  multiEntitySchema: any,
+  links: string[],
+): string {
+  return (
+    (systemPrompt ? `${systemPrompt}\n` : "") +
+    `Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. 
If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. Here are the urls the user provided of which he wants to extract information from: ` + + links.join(", ") + ); +} + +export function buildBatchExtractPrompt(prompt: string): string { + return `Today is: ${new Date().toISOString()}\n${prompt}`; +} diff --git a/apps/api/src/lib/extract/completions/batchExtract.ts b/apps/api/src/lib/extract/completions/batchExtract.ts new file mode 100644 index 0000000000..afed4a3a3e --- /dev/null +++ b/apps/api/src/lib/extract/completions/batchExtract.ts @@ -0,0 +1,47 @@ +import { logger } from "../../../lib/logger"; +import { generateOpenAICompletions } from "../../../scraper/scrapeURL/transformers/llmExtract"; +import { buildDocument } from "../build-document"; +import { ExtractResponse, TokenUsage } from "../../../controllers/v1/types"; +import { Document } from "../../../controllers/v1/types"; +import { + buildBatchExtractPrompt, + buildBatchExtractSystemPrompt, +} from "../build-prompts"; + +/** + * Batch extract information from a list of URLs using a multi-entity schema. + * @param multiEntitySchema - The schema for the multi-entity extraction + * @param links - The URLs to extract information from + * @param prompt - The prompt for the extraction + * @param systemPrompt - The system prompt for the extraction + * @param doc - The document to extract information from + * @returns The completion promise + */ +export function batchExtractPromise( + multiEntitySchema: any, + links: string[], + prompt: string, + systemPrompt: string, + doc: Document, +): Promise>> { + const completionPromise = generateOpenAICompletions( + logger.child({ + method: "extractService/generateOpenAICompletions", + }), + { + mode: "llm", + systemPrompt: buildBatchExtractSystemPrompt( + systemPrompt, + multiEntitySchema, + links, + ), + prompt: buildBatchExtractPrompt(prompt), + schema: multiEntitySchema, + }, + buildDocument(doc), + undefined, + true, + ); + + return completionPromise; +} diff --git a/apps/api/src/lib/extract/completions/checkShouldExtract.ts b/apps/api/src/lib/extract/completions/checkShouldExtract.ts new file mode 100644 index 0000000000..e4dafb44f3 --- /dev/null +++ b/apps/api/src/lib/extract/completions/checkShouldExtract.ts @@ -0,0 +1,40 @@ +import { logger } from "../../../lib/logger"; +import { buildDocument } from "../build-document"; +import { Document, TokenUsage } from "../../../controllers/v1/types"; +import { generateOpenAICompletions } from "../../../scraper/scrapeURL/transformers/llmExtract"; +import { + buildShouldExtractSystemPrompt, + buildShouldExtractUserPrompt, +} from "../build-prompts"; + +export async function checkShouldExtract( + prompt: string, + multiEntitySchema: any, + doc: Document, +): Promise<{ tokenUsage: TokenUsage; extract: boolean }> { + const shouldExtractCheck = await generateOpenAICompletions( + logger.child({ method: "extractService/checkShouldExtract" }), + { + mode: "llm", + systemPrompt: buildShouldExtractSystemPrompt(), + prompt: buildShouldExtractUserPrompt(prompt, multiEntitySchema), + schema: { + type: "object", + properties: { + extract: { + type: "boolean", + }, + }, + required: ["extract"], + }, + }, + buildDocument(doc), + undefined, + true, + ); + + return { + tokenUsage: shouldExtractCheck.totalUsage, + extract: shouldExtractCheck.extract["extract"], + }; +} diff --git a/apps/api/src/lib/extract/completions/singleAnswer.ts b/apps/api/src/lib/extract/completions/singleAnswer.ts 
new file mode 100644 index 0000000000..ac36b034c3 --- /dev/null +++ b/apps/api/src/lib/extract/completions/singleAnswer.ts @@ -0,0 +1,34 @@ +import { logger } from "../../../lib/logger"; +import { generateOpenAICompletions } from "../../../scraper/scrapeURL/transformers/llmExtract"; +import { buildDocument } from "../build-document"; +import { Document } from "../../../controllers/v1/types"; +export async function singleAnswerCompletion({ + singleAnswerDocs, + rSchema, + links, + prompt, + systemPrompt, +}: { + singleAnswerDocs: Document[]; + rSchema: any; + links: string[]; + prompt: string; + systemPrompt: string; +}) { + const completion = await generateOpenAICompletions( + logger.child({ module: "extract", method: "generateOpenAICompletions" }), + { + mode: "llm", + systemPrompt: + (systemPrompt ? `${systemPrompt}\n` : "") + + "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Return 'null' the property that you don't find the information. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " + + links.join(", "), + prompt: "Today is: " + new Date().toISOString() + "\n" + prompt, + schema: rSchema, + }, + singleAnswerDocs.map((x) => buildDocument(x)).join("\n"), + undefined, + true, + ); + return { extract: completion.extract, tokenUsage: completion.totalUsage }; +} diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 37ff162726..513fa8e1d3 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -12,7 +12,6 @@ import { generateOpenAICompletions, generateSchemaFromPrompt, } from "../../scraper/scrapeURL/transformers/llmExtract"; -import { buildDocument } from "./build-document"; import { billTeam } from "../../services/billing/credit_billing"; import { logJob } from "../../services/logging/log_job"; import { _addScrapeJobToBullMQ } from "../../services/queue-jobs"; @@ -29,10 +28,12 @@ import { mergeNullValObjs } from "./helpers/merge-null-val-objs"; import { CUSTOM_U_TEAMS } from "./config"; import { calculateFinalResultCost, - estimateCost, estimateTotalCost, } from "./usage/llm-cost"; import { analyzeSchemaAndPrompt } from "./completions/analyzeSchemaAndPrompt"; +import { checkShouldExtract } from "./completions/checkShouldExtract"; +import { batchExtractPromise } from "./completions/batchExtract"; +import { singleAnswerCompletion } from "./completions/singleAnswer"; interface ExtractServiceOptions { request: ExtractRequest; @@ -63,17 +64,6 @@ type completions = { warning?: string; }; -function getRootDomain(url: string): string { - try { - if (url.endsWith("/*")) { - url = url.slice(0, -2); - } - const urlObj = new URL(url); - return `${urlObj.protocol}//${urlObj.hostname}`; - } catch (e) { - return url; - } -} // Add helper function to track sources function trackFieldSources( @@ -160,6 +150,7 @@ export async function performExtraction( logger.debug("Processing URLs...", { urlCount: request.urls.length, }); + // Process URLs const urlPromises = request.urls.map((url) => processUrl( @@ -385,32 +376,16 @@ export async function performExtraction( setTimeout(() => resolve(null), timeoutCompletion); }); - // // Check if page should be extracted before proceeding - const shouldExtractCheck = await generateOpenAICompletions( - logger.child({ method: "extractService/checkShouldExtract" }), - { - mode: "llm", - systemPrompt: 
- "You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.", - prompt: `Should the following content be used to extract information for this prompt: "${request.prompt}" User schema is: ${JSON.stringify(multiEntitySchema)}\nReturn only true or false.`, - schema: { - type: "object", - properties: { - extract: { - type: "boolean", - }, - }, - required: ["extract"], - }, - }, - buildDocument(doc), - undefined, - true, + // Check if page should be extracted before proceeding + const { extract, tokenUsage: shouldExtractCheckTokenUsage } = await checkShouldExtract( + request.prompt ?? "", + multiEntitySchema, + doc, ); - tokenUsage.push(shouldExtractCheck.totalUsage); + tokenUsage.push(shouldExtractCheckTokenUsage); - if (!shouldExtractCheck.extract["extract"]) { + if (!extract) { console.log( `Skipping extraction for ${doc.metadata.url} as content is irrelevant`, ); @@ -448,24 +423,7 @@ export async function performExtraction( ], }); - const completionPromise = generateOpenAICompletions( - logger.child({ - method: "extractService/generateOpenAICompletions", - }), - { - mode: "llm", - systemPrompt: - (request.systemPrompt ? `${request.systemPrompt}\n` : "") + - `Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. Here are the urls the user provided of which he wants to extract information from: ` + - links.join(", "), - prompt: - "Today is: " + new Date().toISOString() + "\n" + request.prompt, - schema: multiEntitySchema, - }, - buildDocument(doc), - undefined, - true, - ); + const completionPromise = batchExtractPromise(multiEntitySchema, links, request.prompt ?? "", request.systemPrompt ?? "", doc); // Race between timeout and completion const multiEntityCompletion = (await Promise.race([ @@ -679,29 +637,22 @@ export async function performExtraction( // Generate completions logger.debug("Generating singleAnswer completions..."); - singleAnswerCompletions = await generateOpenAICompletions( - logger.child({ module: "extract", method: "generateOpenAICompletions" }), - { - mode: "llm", - systemPrompt: - (request.systemPrompt ? `${request.systemPrompt}\n` : "") + - "Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Return 'null' the property that you don't find the information. Be concise and follow the schema always if provided. Here are the urls the user provided of which he wants to extract information from: " + - links.join(", "), - prompt: "Today is: " + new Date().toISOString() + "\n" + request.prompt, - schema: rSchema, - }, - singleAnswerDocs.map((x) => buildDocument(x)).join("\n"), - undefined, - true, - ); + let { extract: singleAnswerResult, tokenUsage: singleAnswerTokenUsage } = await singleAnswerCompletion({ + singleAnswerDocs, + rSchema, + links, + prompt: request.prompt ?? "", + systemPrompt: request.systemPrompt ?? 
"", + }); logger.debug("Done generating singleAnswer completions."); // Track single answer extraction tokens - if (singleAnswerCompletions) { - tokenUsage.push(singleAnswerCompletions.totalUsage); + if (singleAnswerTokenUsage) { + tokenUsage.push(singleAnswerTokenUsage); } - singleAnswerResult = singleAnswerCompletions.extract; + singleAnswerResult = singleAnswerResult.extract; + singleAnswerCompletions = singleAnswerResult; // Update token usage in traces // if (completions && completions.numTokens) { From 9d1802de9dfddc3d4bd8c8bc4614e7be69f974da Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 26 Jan 2025 22:08:46 -0300 Subject: [PATCH 05/11] Update extraction-service.ts --- .../api/src/lib/extract/extraction-service.ts | 97 ------------------- 1 file changed, 97 deletions(-) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 513fa8e1d3..1387ec19a8 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -52,9 +52,6 @@ interface ExtractResult { tokenUsageBreakdown?: TokenUsage[]; llmUsage?: number; totalUrlsScraped?: number; - sources?: { - [key: string]: string[]; - }; } type completions = { @@ -65,51 +62,6 @@ type completions = { }; -// Add helper function to track sources -function trackFieldSources( - data: any, - url: string, - parentPath: string = "", -): string[] { - const extractedFields: string[] = []; - - if (data && typeof data === "object") { - Object.entries(data).forEach(([key, value]) => { - const currentPath = parentPath ? `${parentPath}.${key}` : key; - - if (value !== null && value !== undefined) { - extractedFields.push(currentPath); - - if (typeof value === "object") { - extractedFields.push(...trackFieldSources(value, url, currentPath)); - } - } - }); - } - - return extractedFields; -} - -// Add helper to merge sources from multiple extractions -function mergeSources(sources: { [key: string]: string[] }[]): { - [key: string]: string[]; -} { - const mergedSources: { [key: string]: string[] } = {}; - - sources.forEach((sourceMap) => { - Object.entries(sourceMap).forEach(([field, urls]) => { - if (!mergedSources[field]) { - mergedSources[field] = []; - } - mergedSources[field].push(...urls); - // Deduplicate URLs - mergedSources[field] = [...new Set(mergedSources[field])]; - }); - }); - - return mergedSources; -} - export async function performExtraction( extractId: string, options: ExtractServiceOptions, @@ -122,7 +74,6 @@ export async function performExtraction( let multiEntityResult: any = {}; let singleAnswerResult: any = {}; let totalUrlsScraped = 0; - let extractionSources: { [key: string]: string[] } = {}; const logger = _logger.child({ module: "extract", @@ -468,31 +419,6 @@ export async function performExtraction( // return null; // } - if (multiEntityCompletion?.extract) { - const extractedFields = trackFieldSources( - multiEntityCompletion.extract, - doc.metadata.url || doc.metadata.sourceURL!, - ); - - // Update URL trace with extracted fields - const trace = urlTraces.find( - (t) => t.url === (doc.metadata.url || doc.metadata.sourceURL!), - ); - if (trace) { - trace.extractedFields = extractedFields; - } - - // Track sources for each field - extractedFields.forEach((field) => { - if (!extractionSources[field]) { - extractionSources[field] = []; - } - extractionSources[field].push( - doc.metadata.url || doc.metadata.sourceURL!, - ); - }); - } - return multiEntityCompletion.extract; } catch (error) { logger.error(`Failed to process 
document.`, { @@ -672,27 +598,6 @@ export async function performExtraction( // } // }); // } - - if (singleAnswerCompletions?.extract) { - const singleAnswerSources: { [key: string]: string[] } = {}; - const usedUrls = Array.from(docsMap.values()) - .map((doc) => doc.metadata.url || doc.metadata.sourceURL!) - .filter(Boolean); - - const extractedFields = trackFieldSources( - singleAnswerCompletions.extract, - "", - ); - extractedFields.forEach((field) => { - singleAnswerSources[field] = usedUrls; - }); - - // Merge with multi-entity sources - extractionSources = mergeSources([ - extractionSources, - singleAnswerSources, - ]); - } } let finalResult = reqSchema @@ -788,7 +693,6 @@ export async function performExtraction( updateExtract(extractId, { status: "completed", llmUsage, - sources: extractionSources, }).catch((error) => { logger.error( `Failed to update extract ${extractId} status to completed: ${error}`, @@ -806,6 +710,5 @@ export async function performExtraction( urlTrace: request.urlTrace ? urlTraces : undefined, llmUsage, totalUrlsScraped, - sources: extractionSources, }; } From a4555af22d8162e0578c573a8dfe269f189a9ec7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 26 Jan 2025 22:39:23 -0300 Subject: [PATCH 06/11] Nick: fixes --- .../api/src/lib/__tests__/mix-schemas.test.ts | 156 ++++++++++++++++++ .../api/src/lib/extract/extraction-service.ts | 8 +- 2 files changed, 160 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/__tests__/mix-schemas.test.ts b/apps/api/src/lib/__tests__/mix-schemas.test.ts index f62d5016e7..5154bb1cba 100644 --- a/apps/api/src/lib/__tests__/mix-schemas.test.ts +++ b/apps/api/src/lib/__tests__/mix-schemas.test.ts @@ -1023,4 +1023,160 @@ describe("mixSchemaObjects function", () => { expect(finalResult).toEqual(singleAnswerResult); }); + + it("should handle empty objects correctly (id: 30)", async () => { + const originalSchema = { + type: "object", + properties: { + business_details: { + type: "object", + properties: { + name: { type: "string" }, + years_in_operation: { type: "string" }, + services_offered: { + type: "array", + items: { type: "string" } + }, + experience_highlights: { type: "string" } + }, + required: ["name"] + }, + management: { + type: "object", + properties: { + owner_name: { type: "string" }, + credentials: { + type: "array", + items: { type: "string" } + } + } + }, + contact_information: { + type: "object", + properties: { + address: { type: "string" }, + phone: { type: "string" } + } + }, + reputation: { + type: "object", + properties: { + client_feedback: { type: "string" }, + operational_quality: { type: "string" } + } + } + }, + required: ["business_details"] + }; + + const singleAnswerResult = { + business_details: { + name: "Red Hill Mobility Group", + years_in_operation: "12 years", + services_offered: [ + "Recovery equipment for military", + "Vehicle mobility solutions", + "Product development for military vehicles" + ], + experience_highlights: "More than 12 years of combined experience overseas on over 25 active combat deployments." 
+ }, + management: { + owner_name: "", + credentials: [] + }, + contact_information: { + address: "659 Shell Drive, Spring Lake, NC 28390", + phone: "910-638-7836" + }, + reputation: { + client_feedback: "", + operational_quality: "" + } + }; + + const multiEntityResult = {}; + + const finalResult = await mixSchemaObjects( + originalSchema, + singleAnswerResult, + undefined + ); + + expect(finalResult).toEqual(singleAnswerResult); + }); + + it("should return single answer result when multi entity is undefined", async () => { + const originalSchema = { + type: "object", + properties: { + business_details: { + type: "object", + properties: { + name: { type: "string" }, + years_in_operation: { type: "string" }, + services_offered: { + type: "array", + items: { type: "string" } + }, + experience_highlights: { type: "string" } + }, + required: ["name"] + }, + management: { + type: "object", + properties: { + owner_name: { type: "string" }, + credentials: { + type: "array", + items: { type: "string" } + } + } + }, + contact_information: { + type: "object", + properties: { + address: { type: "string" }, + phone: { type: "string" } + } + }, + reputation: { + type: "object", + properties: { + client_feedback: { type: "string" }, + operational_quality: { type: "string" } + } + } + }, + required: ["business_details"] + }; + + const singleAnswerResult = { + business_details: { + name: "Red Hill Mobility Group", + years_in_operation: "12 years", + services_offered: [ + "Recovery equipment for military", + "Vehicle mobility solutions", + "Product development for military vehicles" + ], + experience_highlights: "More than 12 years of combined experience overseas on over 25 active combat deployments." + }, + management: { + owner_name: "", + credentials: [] + }, + contact_information: { + address: "659 Shell Drive, Spring Lake, NC 28390", + phone: "910-638-7836" + }, + reputation: { + client_feedback: "", + operational_quality: "" + } + }; + + const finalResult = await mixSchemaObjects(originalSchema, singleAnswerResult, undefined); + + expect(finalResult).toEqual(singleAnswerResult); + }); }); diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 1387ec19a8..7b4a5a1435 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -374,7 +374,7 @@ export async function performExtraction( ], }); - const completionPromise = batchExtractPromise(multiEntitySchema, links, request.prompt ?? "", request.systemPrompt ?? "", doc); + const completionPromise = batchExtractPromise(multiEntitySchema, links, request.prompt ?? "", request.systemPrompt ?? 
"", doc); // Race between timeout and completion const multiEntityCompletion = (await Promise.race([ @@ -563,7 +563,7 @@ export async function performExtraction( // Generate completions logger.debug("Generating singleAnswer completions..."); - let { extract: singleAnswerResult, tokenUsage: singleAnswerTokenUsage } = await singleAnswerCompletion({ + let { extract: completionResult, tokenUsage: singleAnswerTokenUsage } = await singleAnswerCompletion({ singleAnswerDocs, rSchema, links, @@ -576,10 +576,10 @@ export async function performExtraction( if (singleAnswerTokenUsage) { tokenUsage.push(singleAnswerTokenUsage); } - - singleAnswerResult = singleAnswerResult.extract; + singleAnswerResult = completionResult; singleAnswerCompletions = singleAnswerResult; + // Update token usage in traces // if (completions && completions.numTokens) { // const totalLength = docs.reduce( From c998e3c9e583a7b154c65cd37f10f1109ca151a5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 26 Jan 2025 22:45:22 -0300 Subject: [PATCH 07/11] NIck: --- apps/api/src/lib/__tests__/mix-schemas.test.ts | 4 ++-- apps/api/src/lib/extract/extraction-service.ts | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/__tests__/mix-schemas.test.ts b/apps/api/src/lib/__tests__/mix-schemas.test.ts index 5154bb1cba..4bb23e0889 100644 --- a/apps/api/src/lib/__tests__/mix-schemas.test.ts +++ b/apps/api/src/lib/__tests__/mix-schemas.test.ts @@ -1099,7 +1099,7 @@ describe("mixSchemaObjects function", () => { const finalResult = await mixSchemaObjects( originalSchema, singleAnswerResult, - undefined + {} ); expect(finalResult).toEqual(singleAnswerResult); @@ -1175,7 +1175,7 @@ describe("mixSchemaObjects function", () => { } }; - const finalResult = await mixSchemaObjects(originalSchema, singleAnswerResult, undefined); + const finalResult = await mixSchemaObjects(originalSchema, singleAnswerResult, {}); expect(finalResult).toEqual(singleAnswerResult); }); diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 7b4a5a1435..9c2f42018f 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -573,13 +573,13 @@ export async function performExtraction( logger.debug("Done generating singleAnswer completions."); // Track single answer extraction tokens - if (singleAnswerTokenUsage) { + if (completionResult) { tokenUsage.push(singleAnswerTokenUsage); } + singleAnswerResult = completionResult; singleAnswerCompletions = singleAnswerResult; - // Update token usage in traces // if (completions && completions.numTokens) { // const totalLength = docs.reduce( From 7b2d72bde66867692190d6578c17161f46a43117 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 27 Jan 2025 16:05:57 -0300 Subject: [PATCH 08/11] Nick: wip --- .../lib/extract/completions/batchExtract.ts | 19 +++++++-- .../lib/extract/completions/singleAnswer.ts | 15 +++++-- .../api/src/lib/extract/extraction-service.ts | 40 +++++++++++++++++-- apps/api/src/types.ts | 1 + 4 files changed, 64 insertions(+), 11 deletions(-) diff --git a/apps/api/src/lib/extract/completions/batchExtract.ts b/apps/api/src/lib/extract/completions/batchExtract.ts index afed4a3a3e..a7e14e135d 100644 --- a/apps/api/src/lib/extract/completions/batchExtract.ts +++ b/apps/api/src/lib/extract/completions/batchExtract.ts @@ -17,14 +17,20 @@ import { * @param doc - The document to extract information from * @returns The completion promise */ -export function batchExtractPromise( 
+export async function batchExtractPromise( multiEntitySchema: any, links: string[], prompt: string, systemPrompt: string, doc: Document, -): Promise>> { - const completionPromise = generateOpenAICompletions( +): Promise<{ + extract: any; + numTokens: number; + totalUsage: TokenUsage; + warning?: string; + sources: string[]; +}> { + const completion = await generateOpenAICompletions( logger.child({ method: "extractService/generateOpenAICompletions", }), @@ -43,5 +49,10 @@ export function batchExtractPromise( true, ); - return completionPromise; + return { + extract: completion.extract, + numTokens: completion.numTokens, + totalUsage: completion.totalUsage, + sources: [doc.metadata.url || doc.metadata.sourceURL || ""] + }; } diff --git a/apps/api/src/lib/extract/completions/singleAnswer.ts b/apps/api/src/lib/extract/completions/singleAnswer.ts index ac36b034c3..0f32d8c64a 100644 --- a/apps/api/src/lib/extract/completions/singleAnswer.ts +++ b/apps/api/src/lib/extract/completions/singleAnswer.ts @@ -1,7 +1,8 @@ import { logger } from "../../../lib/logger"; import { generateOpenAICompletions } from "../../../scraper/scrapeURL/transformers/llmExtract"; import { buildDocument } from "../build-document"; -import { Document } from "../../../controllers/v1/types"; +import { Document, TokenUsage } from "../../../controllers/v1/types"; + export async function singleAnswerCompletion({ singleAnswerDocs, rSchema, @@ -14,7 +15,11 @@ export async function singleAnswerCompletion({ links: string[]; prompt: string; systemPrompt: string; -}) { +}): Promise<{ + extract: any; + tokenUsage: TokenUsage; + sources: string[]; +}> { const completion = await generateOpenAICompletions( logger.child({ module: "extract", method: "generateOpenAICompletions" }), { @@ -30,5 +35,9 @@ export async function singleAnswerCompletion({ undefined, true, ); - return { extract: completion.extract, tokenUsage: completion.totalUsage }; + return { + extract: completion.extract, + tokenUsage: completion.totalUsage, + sources: singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || "") + }; } diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 9c2f42018f..8d57b251e5 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -52,6 +52,7 @@ interface ExtractResult { tokenUsageBreakdown?: TokenUsage[]; llmUsage?: number; totalUrlsScraped?: number; + sources?: Record; } type completions = { @@ -59,6 +60,7 @@ type completions = { numTokens: number; totalUsage: TokenUsage; warning?: string; + sources?: string[]; }; @@ -74,6 +76,7 @@ export async function performExtraction( let multiEntityResult: any = {}; let singleAnswerResult: any = {}; let totalUrlsScraped = 0; + let sources: Record = {}; const logger = _logger.child({ module: "extract", @@ -385,6 +388,23 @@ export async function performExtraction( // Track multi-entity extraction tokens if (multiEntityCompletion) { tokenUsage.push(multiEntityCompletion.totalUsage); + + // Track sources for multi-entity items + if (multiEntityCompletion.extract) { + // For each multi-entity key, track the source URL + multiEntityKeys.forEach(key => { + const items = multiEntityCompletion.extract[key]; + if (Array.isArray(items)) { + items.forEach((item, index) => { + const sourcePath = `${key}[${index}]`; + if (!sources[sourcePath]) { + sources[sourcePath] = []; + } + sources[sourcePath].push(doc.metadata.url || doc.metadata.sourceURL || ""); + }); + } + }); + } } // 
console.log(multiEntityCompletion.extract) @@ -563,7 +583,7 @@ export async function performExtraction( // Generate completions logger.debug("Generating singleAnswer completions..."); - let { extract: completionResult, tokenUsage: singleAnswerTokenUsage } = await singleAnswerCompletion({ + let { extract: completionResult, tokenUsage: singleAnswerTokenUsage, sources: singleAnswerSources } = await singleAnswerCompletion({ singleAnswerDocs, rSchema, links, @@ -572,9 +592,18 @@ export async function performExtraction( }); logger.debug("Done generating singleAnswer completions."); - // Track single answer extraction tokens + // Track single answer extraction tokens and sources if (completionResult) { tokenUsage.push(singleAnswerTokenUsage); + + // Add sources for top-level properties in single answer + if (rSchema?.properties) { + Object.keys(rSchema.properties).forEach(key => { + if (completionResult[key] !== undefined) { + sources[key] = singleAnswerSources || singleAnswerDocs.map(doc => doc.metadata.url || doc.metadata.sourceURL || ""); + } + }); + } } singleAnswerResult = completionResult; @@ -674,7 +703,7 @@ export async function performExtraction( ); }); - // Log job with token usage + // Log job with token usage and sources logJob({ job_id: extractId, success: true, @@ -689,10 +718,12 @@ export async function performExtraction( origin: request.origin ?? "api", num_tokens: totalTokensUsed, tokens_billed: tokensToBill, + sources, }).then(() => { updateExtract(extractId, { status: "completed", llmUsage, + sources, }).catch((error) => { logger.error( `Failed to update extract ${extractId} status to completed: ${error}`, @@ -706,9 +737,10 @@ export async function performExtraction( success: true, data: finalResult ?? {}, extractId, - warning: undefined, // TODO FIX + warning: undefined, urlTrace: request.urlTrace ? urlTraces : undefined, llmUsage, totalUrlsScraped, + sources, }; } diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index a894c32933..24dbc437c3 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -88,6 +88,7 @@ export interface FirecrawlJob { retry?: boolean; crawl_id?: string; tokens_billed?: number; + sources?: Record; } export interface FirecrawlScrapeResponse { From a2aa104b959f9ae43f2313910f1117d6b7849675 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 27 Jan 2025 20:03:42 -0300 Subject: [PATCH 09/11] Nick: reverted to the old re-ranker --- apps/api/src/controllers/v1/extract-status.ts | 2 +- apps/api/src/lib/extract/build-prompts.ts | 2 +- .../api/src/lib/extract/extraction-service.ts | 2 +- apps/api/src/lib/extract/url-processor.ts | 67 ++++++------------- 4 files changed, 25 insertions(+), 48 deletions(-) diff --git a/apps/api/src/controllers/v1/extract-status.ts b/apps/api/src/controllers/v1/extract-status.ts index da3c08f137..0f51f4fb00 100644 --- a/apps/api/src/controllers/v1/extract-status.ts +++ b/apps/api/src/controllers/v1/extract-status.ts @@ -39,6 +39,6 @@ export async function extractStatusController( expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(), steps: extract.showSteps ? extract.steps : undefined, llmUsage: extract.showLLMUsage ? 
From a2aa104959f9ae43f2313910f1117d6b7849675 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 27 Jan 2025 20:03:42 -0300
Subject: [PATCH 09/11] Nick: reverted to the old re-ranker

---
 apps/api/src/controllers/v1/extract-status.ts  |  2 +-
 apps/api/src/lib/extract/build-prompts.ts      |  2 +-
 apps/api/src/lib/extract/extraction-service.ts |  2 +-
 apps/api/src/lib/extract/url-processor.ts      | 67 ++++++-------------
 4 files changed, 25 insertions(+), 48 deletions(-)

diff --git a/apps/api/src/controllers/v1/extract-status.ts b/apps/api/src/controllers/v1/extract-status.ts
index da3c08f137..0f51f4fb00 100644
--- a/apps/api/src/controllers/v1/extract-status.ts
+++ b/apps/api/src/controllers/v1/extract-status.ts
@@ -39,6 +39,6 @@ export async function extractStatusController(
     expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(),
     steps: extract.showSteps ? extract.steps : undefined,
     llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
-    sources: extract.sources,
+    // sources: extract.sources,
   });
 }
diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts
index ebd6c7fea1..05f5bf6fb3 100644
--- a/apps/api/src/lib/extract/build-prompts.ts
+++ b/apps/api/src/lib/extract/build-prompts.ts
@@ -40,7 +40,7 @@ to determine their relevance to the user's query and intent.
 }
 
 export function buildRerankerUserPrompt(searchQuery: string): string {
-  return `Given these URLs, rank which ones are relevant to the user's extraction intent: "${searchQuery}".`;
+  return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select; if they are not relevant, they may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`;
 }
 
 // Multi entity schema analyzer
diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts
index 8d57b251e5..960c19e038 100644
--- a/apps/api/src/lib/extract/extraction-service.ts
+++ b/apps/api/src/lib/extract/extraction-service.ts
@@ -741,6 +741,6 @@ export async function performExtraction(
     urlTrace: request.urlTrace ? urlTraces : undefined,
     llmUsage,
     totalUrlsScraped,
-    sources,
+    // sources,
   };
 }
diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts
index cb09feb9ff..4949df9b2d 100644
--- a/apps/api/src/lib/extract/url-processor.ts
+++ b/apps/api/src/lib/extract/url-processor.ts
@@ -68,7 +68,7 @@ export async function processUrl(
   try {
     logger.debug("Running map...", {
       search: searchQuery,
-    })
+    });
     const mapResults = await getMapResults({
       url: baseUrl,
       search: searchQuery,
@@ -200,67 +200,44 @@ export async function processUrl(
     //   );
 
     logger.info("Generated rephrased prompt.", {
-      rephrasedPrompt
+      rephrasedPrompt,
     });
 
-    let rerankedLinks = mappedLinks;
     logger.info("Reranking pass 1 (threshold 0.8)...");
     const rerankerResult = await rerankLinksWithLLM({
-      links: rerankedLinks,
+      links: mappedLinks,
      searchQuery: rephrasedPrompt,
-      urlTraces
+      urlTraces,
     });
-    rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.8);
+    mappedLinks = rerankerResult.mapDocument;
     let tokensUsed = rerankerResult.tokensUsed;
-
-    logger.info("Reranked! (threshold 0.8)", {
-      linkCount: rerankedLinks.length,
+    logger.info("Reranked! (pass 1)", {
+      linkCount: mappedLinks.length,
     });
 
-    // lower threshold to 0.6 if no links are found
-    if (rerankedLinks.length === 0) {
-      logger.info("No links found. Reranking with threshold 0.6");
-      rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6);
-      logger.info("Reranked! (threshold 0.6)", {
-        linkCount: rerankedLinks.length,
-      });
-    }
-
-    // lower threshold to 0.3 if no links are found
-    if (rerankedLinks.length === 0) {
-      logger.info("No links found. Reranking with threshold 0.3");
-      rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.3);
-      logger.info("Reranked! (threshold 0.3)", {
-        linkCount: rerankedLinks.length,
-      });
-    }
-
     // 2nd Pass, useful for when the first pass returns too many links
-    if (rerankedLinks.length > 100) {
-      logger.info("Reranking pass 2 (> 100 links - threshold 0.6)...");
-      const secondPassRerankerResult = await rerankLinksWithLLM({
-        links: rerankedLinks,
+    if (mappedLinks.length > 100) {
+      logger.info("Reranking (pass 2)...");
+      const rerankerResult = await rerankLinksWithLLM({
+        links: mappedLinks,
         searchQuery: rephrasedPrompt,
         urlTraces,
       });
-
-      // why 0.6? average? experimental results?
-      if (secondPassRerankerResult.mapDocument.length > 0) {
-        rerankedLinks = secondPassRerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6);
-        logger.info("Reranked! (threshold 0.6)", {
-          linkCount: rerankedLinks.length,
-        });
-      }
-    }
-
-    // If no relevant links are found, return the original mapped links
-    if (rerankedLinks.length === 0) {
-      logger.info("No links found. Not reranking.");
-      rerankedLinks = mappedLinks;
+      mappedLinks = rerankerResult.mapDocument;
+      tokensUsed += rerankerResult.tokensUsed;
+      logger.info("Reranked! (pass 2)", {
+        linkCount: mappedLinks.length,
+      });
     }
 
+    // dumpToFile(
+    //   "llm-links.txt",
+    //   mappedLinks,
+    //   (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
+    // );
     // Remove title and description from mappedLinks
     mappedLinks = mappedLinks.map((link) => ({ url: link.url }));
+    console.log("mappedLinks: ", mappedLinks);
     return mappedLinks.map((x) => x.url);
   } catch (error) {
     trace.status = "error";
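For contrast with the revert above: the deleted url-processor logic was a
descending-threshold cascade over the reranker's relevance scores, whereas the
restored version keeps whatever the LLM returns and pushes the 0.6 cutoff into
the prompt itself. The dropped cascade, compressed into a loop (a sketch of the
old behavior, not code that remains in the tree):

    // Sketch of the removed cascade: keep the links above the highest
    // threshold that yields at least one result, else keep everything.
    type RankedLink = { url: string; relevanceScore?: number };

    function filterByCascade(ranked: RankedLink[]): RankedLink[] {
      for (const threshold of [0.8, 0.6, 0.3]) {
        const kept = ranked.filter(
          (x) => x.relevanceScore !== undefined && x.relevanceScore > threshold,
        );
        if (kept.length > 0) return kept;
      }
      return ranked; // old fallback: return the original mapped links
    }
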
(threshold 0.3)", { - linkCount: rerankedLinks.length, - }); - } - // 2nd Pass, useful for when the first pass returns too many links - if (rerankedLinks.length > 100) { - logger.info("Reranking pass 2 (> 100 links - threshold 0.6)..."); - const secondPassRerankerResult = await rerankLinksWithLLM({ - links: rerankedLinks, + if (mappedLinks.length > 100) { + logger.info("Reranking (pass 2)..."); + const rerankerResult = await rerankLinksWithLLM({ + links: mappedLinks, searchQuery: rephrasedPrompt, urlTraces, }); - - // why 0.6? average? experimental results? - if (secondPassRerankerResult.mapDocument.length > 0) { - rerankedLinks = secondPassRerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6); - logger.info("Reranked! (threshold 0.6)", { - linkCount: rerankedLinks.length, - }); - } - } - - // If no relevant links are found, return the original mapped links - if (rerankedLinks.length === 0) { - logger.info("No links found. Not reranking."); - rerankedLinks = mappedLinks; + mappedLinks = rerankerResult.mapDocument; + tokensUsed += rerankerResult.tokensUsed; + logger.info("Reranked! (pass 2)", { + linkCount: mappedLinks.length, + }); } + // dumpToFile( + // "llm-links.txt", + // mappedLinks, + // (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}` + // ); // Remove title and description from mappedLinks mappedLinks = mappedLinks.map((link) => ({ url: link.url })); + console.log("mappedLinks: ", mappedLinks); return mappedLinks.map((x) => x.url); } catch (error) { trace.status = "error"; From 9343b99b87406c923f68afe2c8d49fccfa0808fe Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 27 Jan 2025 20:05:33 -0300 Subject: [PATCH 10/11] Nick: --- apps/api/src/lib/extract/extraction-service.ts | 2 +- apps/api/src/lib/extract/url-processor.ts | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 960c19e038..9d1dd453a7 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -340,7 +340,7 @@ export async function performExtraction( tokenUsage.push(shouldExtractCheckTokenUsage); if (!extract) { - console.log( + logger.info( `Skipping extraction for ${doc.metadata.url} as content is irrelevant`, ); return null; diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index 4949df9b2d..6525de881e 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -237,7 +237,6 @@ export async function processUrl( // ); // Remove title and description from mappedLinks mappedLinks = mappedLinks.map((link) => ({ url: link.url })); - console.log("mappedLinks: ", mappedLinks); return mappedLinks.map((x) => x.url); } catch (error) { trace.status = "error"; From 645018e1b53da4f95caf597c5dec3b9b7b5de603 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 27 Jan 2025 20:06:07 -0300 Subject: [PATCH 11/11] Update extract-status.ts --- apps/api/src/controllers/v1/extract-status.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/extract-status.ts b/apps/api/src/controllers/v1/extract-status.ts index 0f51f4fb00..20b87f684f 100644 --- a/apps/api/src/controllers/v1/extract-status.ts +++ b/apps/api/src/controllers/v1/extract-status.ts @@ -30,7 +30,7 @@ export async function extractStatusController( data = jobData[0].docs; } - 
From 645018e1b53da4f95caf597c5dec3b9b7b5de603 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 27 Jan 2025 20:06:07 -0300
Subject: [PATCH 11/11] Update extract-status.ts

---
 apps/api/src/controllers/v1/extract-status.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/api/src/controllers/v1/extract-status.ts b/apps/api/src/controllers/v1/extract-status.ts
index 0f51f4fb00..20b87f684f 100644
--- a/apps/api/src/controllers/v1/extract-status.ts
+++ b/apps/api/src/controllers/v1/extract-status.ts
@@ -30,7 +30,7 @@ export async function extractStatusController(
     data = jobData[0].docs;
   }
 
-  console.log(extract.sources);
+  // console.log(extract.sources);
   return res.status(200).json({
     success: extract.status === "failed" ? false : true,
     data: data,