Skip to content

Commit

Permalink
Nick: extract without a schema should work as expected
Browse files Browse the repository at this point in the history
  • Loading branch information
nickscamara committed Jan 14, 2025
1 parent 61e6af2 commit 957eea4
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 53 deletions.
2 changes: 0 additions & 2 deletions apps/api/src/lib/extract/extract-redis.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@ export enum ExtractStep {
MULTI_ENTITY_SCRAPE = "multi-entity-scrape",
MULTI_ENTITY_EXTRACT = "multi-entity-extract",
SCRAPE = "scrape",

EXTRACT = "extract",

COMPLETE = "complete",
}

Expand Down
145 changes: 94 additions & 51 deletions apps/api/src/lib/extract/extraction-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,16 @@ import { PlanType } from "../../types";
import { logger } from "../logger";
import { processUrl } from "./url-processor";
import { scrapeDocument } from "./document-scraper";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import {
generateOpenAICompletions,
generateSchemaFromPrompt,
} from "../../scraper/scrapeURL/transformers/llmExtract";
import { buildDocument } from "./build-document";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
import { saveCrawl, StoredCrawl } from "../crawl-redis";
import { dereferenceSchema } from "./helpers/dereference-schema";
import { dereferenceSchema } from "./helpers/dereference-schema";
import { z } from "zod";
import OpenAI from "openai";
import { spreadSchemas } from "./helpers/spread-schemas";
Expand Down Expand Up @@ -45,7 +48,6 @@ interface ExtractResult {
error?: string;
}


async function analyzeSchemaAndPrompt(
urls: string[],
schema: any,
Expand All @@ -56,6 +58,10 @@ async function analyzeSchemaAndPrompt(
reasoning?: string;
keyIndicators?: string[];
}> {
if (!schema) {
schema = await generateSchemaFromPrompt(prompt);
}

const schemaString = JSON.stringify(schema);

const checkSchema = z.object({
Expand Down Expand Up @@ -132,7 +138,7 @@ type completions = {
extract: Record<string, any>;
numTokens: number;
warning?: string;
}
};

function getRootDomain(url: string): string {
try {
Expand Down Expand Up @@ -186,20 +192,22 @@ export async function performExtraction(
includeSubdomains: request.includeSubdomains,
schema: request.schema,
},
urlTraces,
urlTraces,
(links: string[]) => {
aggMapLinks.push(...links);
updateExtract(extractId, {
steps: [
{
step: ExtractStep.MAP,
startedAt: startMap,
finishedAt: Date.now(),
discoveredLinks: aggMapLinks,
},
],
});
}));
{
step: ExtractStep.MAP,
startedAt: startMap,
finishedAt: Date.now(),
discoveredLinks: aggMapLinks,
},
],
});
},
),
);

const processedUrls = await Promise.all(urlPromises);
const links = processedUrls.flat().filter((url) => url);
Expand Down Expand Up @@ -227,7 +235,13 @@ export async function performExtraction(
});

let reqSchema = request.schema;
reqSchema = await dereferenceSchema(reqSchema);
if (!reqSchema && request.prompt) {
reqSchema = await generateSchemaFromPrompt(request.prompt);
}

if (reqSchema) {
reqSchema = await dereferenceSchema(reqSchema);
}

// the agent evaluates whether the schema or the prompt contains an array with a large number of items
// it also checks whether the schema has any other properties that are not arrays
Expand All @@ -236,16 +250,19 @@ export async function performExtraction(
// 2. the second one is multiple completions that will extract the items from the array
let startAnalyze = Date.now();
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
await analyzeSchemaAndPrompt(links, request.schema, request.prompt ?? "");
await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");

// console.log("\nIs Multi Entity:", isMultiEntity);
// console.log("\nMulti Entity Keys:", multiEntityKeys);
// console.log("\nReasoning:", reasoning);
// console.log("\nKey Indicators:", keyIndicators);

let rSchema = reqSchema;
if (isMultiEntity) {
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(reqSchema, multiEntityKeys)
if (isMultiEntity && reqSchema) {
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
reqSchema,
multiEntityKeys,
);
rSchema = singleAnswerSchema;

await updateExtract(extractId, {
Expand All @@ -260,7 +277,6 @@ export async function performExtraction(
],
});


const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;

await updateExtract(extractId, {
Expand All @@ -287,11 +303,11 @@ export async function performExtraction(
timeout,
},
urlTraces,
)
);
}
return docsMap.get(url);
})
});

let multyEntityDocs = (await Promise.all(scrapePromises)).filter(
(doc): doc is Document => doc !== null,
);
Expand All @@ -315,7 +331,7 @@ export async function performExtraction(
docsMap.set(doc.metadata.url, doc);
}
}

// Process docs in chunks with queue style processing
const chunkSize = 50;
const timeoutCompletion = 45000; // 45 second timeout
Expand All @@ -331,7 +347,7 @@ export async function performExtraction(
const chunkPromises = chunk.map(async (doc) => {
try {
ajv.compile(multiEntitySchema);

// Wrap in timeout promise
const timeoutPromise = new Promise((resolve) => {
setTimeout(() => resolve(null), timeoutCompletion);
Expand All @@ -342,25 +358,28 @@ export async function performExtraction(
logger.child({ method: "extractService/checkShouldExtract" }),
{
mode: "llm",
systemPrompt: "You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.",
systemPrompt:
"You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.",
prompt: `Should the following content be used to extract information for this prompt: "${request.prompt}" User schema is: ${JSON.stringify(multiEntitySchema)}\nReturn only true or false.`,
schema: {
"type": "object",
"properties": {
"extract": {
"type": "boolean"
}
type: "object",
properties: {
extract: {
type: "boolean",
},
},
"required": ["extract"]
}
required: ["extract"],
},
},
buildDocument(doc),
undefined,
true
true,
);

if (!shouldExtractCheck.extract["extract"]) {
console.log(`Skipping extraction for ${doc.metadata.url} as content is irrelevant`);
console.log(
`Skipping extraction for ${doc.metadata.url} as content is irrelevant`,
);
return null;
}
// Add confidence score to schema with 5 levels
Expand All @@ -369,11 +388,15 @@ export async function performExtraction(
properties: {
...multiEntitySchema.properties,
is_content_relevant: {
type: "boolean",
description: "Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information."
}
type: "boolean",
description:
"Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information.",
},
},
required: [...(multiEntitySchema.required || []), "is_content_relevant"]
required: [
...(multiEntitySchema.required || []),
"is_content_relevant",
],
};
// console.log("schemaWithConfidence", schemaWithConfidence);

Expand All @@ -384,15 +407,19 @@ export async function performExtraction(
step: ExtractStep.MULTI_ENTITY_EXTRACT,
startedAt: startScrape,
finishedAt: Date.now(),
discoveredLinks: [doc.metadata.url || doc.metadata.sourceURL || ""],
discoveredLinks: [
doc.metadata.url || doc.metadata.sourceURL || "",
],
},
],
});

const completionPromise = generateOpenAICompletions(
logger.child({ method: "extractService/generateOpenAICompletions" }),
logger.child({
method: "extractService/generateOpenAICompletions",
}),
{
mode: "llm",
mode: "llm",
systemPrompt:
(request.systemPrompt ? `${request.systemPrompt}\n` : "") +
`Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. Here are the urls the user provided of which he wants to extract information from: ` +
Expand All @@ -406,10 +433,10 @@ export async function performExtraction(
);

// Race between timeout and completion
const multiEntityCompletion = await Promise.race([
const multiEntityCompletion = (await Promise.race([
completionPromise,
timeoutPromise
]) as Awaited<ReturnType<typeof generateOpenAICompletions>>;
timeoutPromise,
])) as Awaited<ReturnType<typeof generateOpenAICompletions>>;

// console.log(multiEntityCompletion.extract)
// if (!multiEntityCompletion.extract?.is_content_relevant) {
Expand Down Expand Up @@ -452,25 +479,36 @@ export async function performExtraction(

// Wait for current chunk to complete before processing next chunk
const chunkResults = await Promise.all(chunkPromises);
multiEntityCompletions.push(...chunkResults.filter(result => result !== null));
multiEntityCompletions.push(
...chunkResults.filter((result) => result !== null),
);
}

try {
multiEntityResult = transformArrayToObject(multiEntitySchema, multiEntityCompletions);
multiEntityResult = transformArrayToObject(
multiEntitySchema,
multiEntityCompletions,
);
multiEntityResult = deduplicateObjectsArray(multiEntityResult);
multiEntityResult = mergeNullValObjs(multiEntityResult);
// @nick: maybe we can add an LLM step here that checks whether the array likely has a primary key?
} catch (error) {
logger.error(`Failed to transform array to object: ${error}`);
return {
success: false,
error: "An unexpected error occurred. Please contact help@firecrawl.com for help.",
error:
"An unexpected error occurred. Please contact help@firecrawl.com for help.",
extractId,
urlTrace: urlTraces,
};
}
}
if (rSchema && Object.keys(rSchema).length > 0 && rSchema.properties && Object.keys(rSchema.properties).length > 0) {
if (
rSchema &&
Object.keys(rSchema).length > 0 &&
rSchema.properties &&
Object.keys(rSchema.properties).length > 0
) {
// Scrape documents
const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
let singleAnswerDocs: Document[] = [];
Expand Down Expand Up @@ -513,7 +551,9 @@ export async function performExtraction(
}
}

singleAnswerDocs.push(...results.filter((doc): doc is Document => doc !== null));
singleAnswerDocs.push(
...results.filter((doc): doc is Document => doc !== null),
);
} catch (error) {
return {
success: false,
Expand All @@ -527,7 +567,8 @@ export async function performExtraction(
// All urls are invalid
return {
success: false,
error: "All provided URLs are invalid. Please check your input and try again.",
error:
"All provided URLs are invalid. Please check your input and try again.",
extractId,
urlTrace: request.urlTrace ? urlTraces : undefined,
};
Expand Down Expand Up @@ -584,7 +625,9 @@ export async function performExtraction(
// }
}

const finalResult = await mixSchemaObjects(reqSchema, singleAnswerResult, multiEntityResult);
const finalResult = reqSchema
? await mixSchemaObjects(reqSchema, singleAnswerResult, multiEntityResult)
: singleAnswerResult || multiEntityResult;

let linksBilled = links.length * 5;

Expand Down
Loading

0 comments on commit 957eea4

Please sign in to comment.