From 64d116540fab20676dda18de5dedffa4251b6ed3 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 24 Jan 2025 09:08:16 -0300 Subject: [PATCH 1/6] rerank with lower threshold + back to map if lenght = 0 --- apps/api/src/lib/extract/build-prompts.ts | 2 +- apps/api/src/lib/extract/reranker.ts | 35 ++++++++----- apps/api/src/lib/extract/url-processor.ts | 62 ++++++++++++++--------- 3 files changed, 61 insertions(+), 38 deletions(-) diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts index 8996c13d54..51ff447d3b 100644 --- a/apps/api/src/lib/extract/build-prompts.ts +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -35,5 +35,5 @@ export function buildRerankerSystemPrompt(): string { } export function buildRerankerUserPrompt(searchQuery: string): string { - return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relvancy score of 0.6+.`; + return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction.`; } diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts index dc23d4cbf1..eec5dc9292 100644 --- a/apps/api/src/lib/extract/reranker.ts +++ b/apps/api/src/lib/extract/reranker.ts @@ -158,24 +158,27 @@ function filterAndProcessLinks( } export type RerankerResult = { - mapDocument: MapDocument[]; + mapDocument: (MapDocument & { relevanceScore?: number })[]; tokensUsed: number; }; -export async function rerankLinksWithLLM( - mappedLinks: MapDocument[], - searchQuery: string, - urlTraces: URLTrace[], -): Promise { +export type RerankerOptions = { + links: MapDocument[]; + searchQuery: string; + urlTraces: URLTrace[]; +}; + +export async function rerankLinksWithLLM(options: RerankerOptions): Promise { + const { links, searchQuery, urlTraces } = options; const chunkSize = 100; const chunks: MapDocument[][] = []; const TIMEOUT_MS = 20000; const MAX_RETRIES = 2; let totalTokensUsed = 0; - // Split mappedLinks into chunks of 200 - for (let i = 0; i < mappedLinks.length; i += chunkSize) { - chunks.push(mappedLinks.slice(i, i + chunkSize)); + // Split links into chunks of 200 + for (let i = 0; i < links.length; i += chunkSize) { + chunks.push(links.slice(i, i + chunkSize)); } // console.log(`Total links: ${mappedLinks.length}, Number of chunks: ${chunks.length}`); @@ -190,8 +193,9 @@ export async function rerankLinksWithLLM( properties: { url: { type: "string" }, relevanceScore: { type: "number" }, + reason: { type: "string" }, }, - required: ["url", "relevanceScore"], + required: ["url", "relevanceScore", "reason"], }, }, }, @@ -275,10 +279,15 @@ export async function rerankLinksWithLLM( // Map back to MapDocument format, keeping only relevant links const relevantLinks = flattenedResults - .map((result) => mappedLinks.find((link) => link.url === result.url)) - .filter((link): link is MapDocument => link !== undefined); + .map((result) => { + const link = links.find((link) => link.url === result.url); + if (link) { + return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0 }; + } + return undefined; + }) + .filter((link): link is NonNullable => link !== undefined); - // console.log(`Returning ${relevantLinks.length} relevant links`); return { mapDocument: relevantLinks, tokensUsed: totalTokensUsed, diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index ab9f6f60ab..7a265f365f 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -203,38 +203,52 @@ export async function processUrl( rephrasedPrompt }); - logger.info("Reranking (pass 1)..."); - const rerankerResult = await rerankLinksWithLLM( - mappedLinks, - rephrasedPrompt, - urlTraces, - ); - mappedLinks = rerankerResult.mapDocument; + let rerankedLinks = mappedLinks; + logger.info("Reranking pass 1 (threshold 0.6)..."); + const rerankerResult = await rerankLinksWithLLM({ + links: rerankedLinks, + searchQuery: rephrasedPrompt, + urlTraces + }); + rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6); let tokensUsed = rerankerResult.tokensUsed; - logger.info("Reranked! (pass 1)", { - linkCount: mappedLinks.length, + + logger.info("Reranked! (threshold 0.6)", { + linkCount: rerankedLinks.length, }); + // lower threshold to 0.3 if no links are found + if (rerankedLinks.length === 0) { + logger.info("No links found. Reranking with threshold 0.3"); + rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.3); + logger.info("Reranked! (threshold 0.3)", { + linkCount: rerankedLinks.length, + }); + } + // 2nd Pass, useful for when the first pass returns too many links - if (mappedLinks.length > 100) { - logger.info("Reranking (pass 2)..."); - const rerankerResult = await rerankLinksWithLLM( - mappedLinks, - rephrasedPrompt, + if (rerankedLinks.length > 100) { + logger.info("Reranking pass 2 (> 100 links - threshold 0.6)..."); + const secondPassRerankerResult = await rerankLinksWithLLM({ + links: rerankedLinks, + searchQuery: rephrasedPrompt, urlTraces, - ); - mappedLinks = rerankerResult.mapDocument; - tokensUsed += rerankerResult.tokensUsed; - logger.info("Reranked! (pass 2)", { - linkCount: mappedLinks.length, }); + + if (secondPassRerankerResult.mapDocument.length > 0) { + rerankedLinks = secondPassRerankerResult.mapDocument; + logger.info("Reranked! (threshold 0.6)", { + linkCount: rerankedLinks.length, + }); + } + } + + // If no relevant links are found, return the original mapped links + if (rerankedLinks.length === 0) { + logger.info("No links found. Not reranking."); + rerankedLinks = mappedLinks; } - // dumpToFile( - // "llm-links.txt", - // mappedLinks, - // (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}` - // ); // Remove title and description from mappedLinks mappedLinks = mappedLinks.map((link) => ({ url: link.url })); return mappedLinks.map((x) => x.url); From 3184e91f667ed480d41e9912e5e91074367e32ea Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 24 Jan 2025 10:25:45 -0300 Subject: [PATCH 2/6] layers --- apps/api/src/lib/extract/url-processor.ts | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index 7a265f365f..cb09feb9ff 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -204,19 +204,28 @@ export async function processUrl( }); let rerankedLinks = mappedLinks; - logger.info("Reranking pass 1 (threshold 0.6)..."); + logger.info("Reranking pass 1 (threshold 0.8)..."); const rerankerResult = await rerankLinksWithLLM({ links: rerankedLinks, searchQuery: rephrasedPrompt, urlTraces }); - rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6); + rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.8); let tokensUsed = rerankerResult.tokensUsed; - logger.info("Reranked! (threshold 0.6)", { + logger.info("Reranked! (threshold 0.8)", { linkCount: rerankedLinks.length, }); + // lower threshold to 0.6 if no links are found + if (rerankedLinks.length === 0) { + logger.info("No links found. Reranking with threshold 0.6"); + rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6); + logger.info("Reranked! (threshold 0.6)", { + linkCount: rerankedLinks.length, + }); + } + // lower threshold to 0.3 if no links are found if (rerankedLinks.length === 0) { logger.info("No links found. Reranking with threshold 0.3"); @@ -235,8 +244,9 @@ export async function processUrl( urlTraces, }); + // why 0.6? average? experimental results? if (secondPassRerankerResult.mapDocument.length > 0) { - rerankedLinks = secondPassRerankerResult.mapDocument; + rerankedLinks = secondPassRerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6); logger.info("Reranked! (threshold 0.6)", { linkCount: rerankedLinks.length, }); From d547192f374b84aca2af90ffceec1ba95856db66 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 24 Jan 2025 17:55:16 -0300 Subject: [PATCH 3/6] Nick: fixed spread schemas --- .../__tests__/spread-schema-objects.test.ts | 570 ++++++++++++++++++ .../src/lib/extract/helpers/spread-schemas.ts | 45 +- 2 files changed, 611 insertions(+), 4 deletions(-) diff --git a/apps/api/src/lib/__tests__/spread-schema-objects.test.ts b/apps/api/src/lib/__tests__/spread-schema-objects.test.ts index 7cfcdd03da..f49fb378d8 100644 --- a/apps/api/src/lib/__tests__/spread-schema-objects.test.ts +++ b/apps/api/src/lib/__tests__/spread-schema-objects.test.ts @@ -283,4 +283,574 @@ describe("spreadSchemas", () => { expect(singleAnswerSchema).toEqual({}); expect(multiEntitySchema).toEqual(schema); }); + + it("should spread pages schema", async () => { + const schema = { + type: "object", + properties: { + pages: { + type: "array", + items: { + type: "object", + properties: { + title: { + type: "string", + }, + }, + }, + }, + }, + required: ["pages"], + }; + + const keys = ["pages"]; + const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas( + schema, + keys, + ); + + expect(singleAnswerSchema).toEqual({}); + expect(multiEntitySchema).toEqual(schema); + }); + + it("should spread pages schema", async () => { + const schema = { + type: "object", + properties: { + pages: { + type: "array", + items: { + type: "object", + properties: { + title: { + type: "string", + }, + }, + }, + }, + }, + required: ["pages"], + }; + + const keys = ["pages.title"]; + const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas( + schema, + keys, + ); + + expect(singleAnswerSchema).toEqual({}); + expect(multiEntitySchema).toEqual(schema); + }); + + it("should handle deeply nested array properties", async () => { + const schema = { + type: "object", + properties: { + company: { + type: "object", + properties: { + name: { type: "string" }, + departments: { + type: "array", + items: { + type: "object", + properties: { + name: { type: "string" }, + employees: { + type: "array", + items: { + type: "object", + properties: { + name: { type: "string" }, + role: { type: "string" }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + required: ["company"], + }; + + const keys = ["company.departments.employees"]; + const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas( + schema, + keys, + ); + + expect(singleAnswerSchema).toEqual({}); + expect(multiEntitySchema).toEqual(schema); + }); + + it("should handle multiple nested paths", async () => { + const schema = { + type: "object", + properties: { + user: { + type: "object", + properties: { + name: { type: "string" }, + contacts: { + type: "array", + items: { + type: "object", + properties: { + email: { type: "string" }, + phone: { type: "string" }, + }, + }, + }, + }, + }, + orders: { + type: "array", + items: { + type: "object", + properties: { + id: { type: "string" }, + items: { + type: "array", + items: { + type: "object", + properties: { + name: { type: "string" }, + quantity: { type: "number" }, + }, + }, + }, + }, + }, + }, + }, + required: ["user", "orders"], + }; + + const keys = ["user.contacts", "orders.items"]; + const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas( + schema, + keys, + ); + + expect(singleAnswerSchema).toEqual({}); + expect(multiEntitySchema).toEqual(schema); + }); + + it("should handle mixed single and array properties", async () => { + const schema = { + type: "object", + properties: { + metadata: { + type: "object", + properties: { + title: { type: "string" }, + description: { type: "string" }, + }, + }, + sections: { + type: "array", + items: { + type: "object", + properties: { + title: { type: "string" }, + content: { type: "string" }, + }, + }, + }, + }, + required: ["metadata", "sections"], + }; + + const keys = ["sections"]; + const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas( + schema, + keys, + ); + + expect(singleAnswerSchema).toEqual({ + type: "object", + properties: { + metadata: { + type: "object", + properties: { + title: { type: "string" }, + description: { type: "string" }, + }, + }, + }, + required: ["metadata"], + }); + + expect(multiEntitySchema).toEqual({ + type: "object", + properties: { + sections: { + type: "array", + items: { + type: "object", + properties: { + title: { type: "string" }, + content: { type: "string" }, + }, + }, + }, + }, + required: ["sections"], + }); + }); + + it("should handle empty keys array", async () => { + const schema = { + type: "object", + properties: { + name: { type: "string" }, + age: { type: "number" }, + }, + required: ["name"], + }; + + const keys: string[] = []; + const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas( + schema, + keys, + ); + + expect(singleAnswerSchema).toEqual(schema); + expect(multiEntitySchema).toEqual({}); + }); + + it("should handle non-existent paths", async () => { + const schema = { + type: "object", + properties: { + user: { + type: "object", + properties: { + name: { type: "string" }, + }, + }, + }, + }; + + const keys = ["user.nonexistent.path"]; + const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas( + schema, + keys, + ); + + expect(singleAnswerSchema).toEqual({}); + expect(multiEntitySchema).toEqual(schema); + }); + + // it("should split nested object and array properties", async () => { + // const schema = { + // type: "object", + // properties: { + // company: { + // type: "object", + // properties: { + // name: { type: "string" }, + // address: { + // type: "object", + // properties: { + // street: { type: "string" }, + // city: { type: "string" }, + // }, + // }, + // employees: { + // type: "array", + // items: { + // type: "object", + // properties: { + // name: { type: "string" }, + // position: { type: "string" }, + // }, + // }, + // }, + // }, + // }, + // }, + // required: ["company"], + // }; + + // const keys = ["company.employees"]; + // const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas( + // schema, + // keys, + // ); + + // expect(singleAnswerSchema).toEqual({ + // type: "object", + // properties: { + // company: { + // type: "object", + // properties: { + // name: { type: "string" }, + // address: { + // type: "object", + // properties: { + // street: { type: "string" }, + // city: { type: "string" }, + // }, + // }, + // }, + // }, + // }, + // required: ["company"], + // }); + + // expect(multiEntitySchema).toEqual({ + // type: "object", + // properties: { + // company: { + // type: "object", + // properties: { + // employees: { + // type: "array", + // items: { + // type: "object", + // properties: { + // name: { type: "string" }, + // position: { type: "string" }, + // }, + // }, + // }, + // }, + // }, + // }, + // required: ["company"], + // }); + // }); + + // it("should handle multiple root level properties with nested paths", async () => { + // const schema = { + // type: "object", + // properties: { + // user: { + // type: "object", + // properties: { + // id: { type: "string" }, + // profile: { + // type: "object", + // properties: { + // name: { type: "string" }, + // email: { type: "string" }, + // }, + // }, + // posts: { + // type: "array", + // items: { + // type: "object", + // properties: { + // title: { type: "string" }, + // content: { type: "string" }, + // }, + // }, + // }, + // }, + // }, + // settings: { + // type: "object", + // properties: { + // theme: { type: "string" }, + // notifications: { + // type: "object", + // properties: { + // email: { type: "boolean" }, + // push: { type: "boolean" }, + // }, + // }, + // }, + // }, + // }, + // required: ["user", "settings"], + // }; + + // const keys = ["user.posts", "settings.notifications"]; + // const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas( + // schema, + // keys, + // ); + + // expect(singleAnswerSchema).toEqual({ + // type: "object", + // properties: { + // user: { + // type: "object", + // properties: { + // id: { type: "string" }, + // profile: { + // type: "object", + // properties: { + // name: { type: "string" }, + // email: { type: "string" }, + // }, + // }, + // }, + // }, + // settings: { + // type: "object", + // properties: { + // theme: { type: "string" }, + // }, + // }, + // }, + // required: ["user", "settings"], + // }); + + // expect(multiEntitySchema).toEqual({ + // type: "object", + // properties: { + // user: { + // type: "object", + // properties: { + // posts: { + // type: "array", + // items: { + // type: "object", + // properties: { + // title: { type: "string" }, + // content: { type: "string" }, + // }, + // }, + // }, + // }, + // }, + // settings: { + // type: "object", + // properties: { + // notifications: { + // type: "object", + // properties: { + // email: { type: "boolean" }, + // push: { type: "boolean" }, + // }, + // }, + // }, + // }, + // }, + // required: ["user", "settings"], + // }); + // }); + + // it("should handle array properties at different nesting levels", async () => { + // const schema = { + // type: "object", + // properties: { + // categories: { + // type: "array", + // items: { + // type: "object", + // properties: { + // name: { type: "string" }, + // subcategories: { + // type: "array", + // items: { + // type: "object", + // properties: { + // name: { type: "string" }, + // products: { + // type: "array", + // items: { + // type: "object", + // properties: { + // name: { type: "string" }, + // price: { type: "number" }, + // }, + // }, + // }, + // }, + // }, + // }, + // }, + // }, + // }, + // featured: { + // type: "object", + // properties: { + // category: { type: "string" }, + // items: { + // type: "array", + // items: { + // type: "object", + // properties: { + // id: { type: "string" }, + // name: { type: "string" }, + // }, + // }, + // }, + // }, + // }, + // }, + // }; + + // const keys = ["categories.subcategories", "featured.items"]; + // const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas( + // schema, + // keys, + // ); + + // expect(singleAnswerSchema).toEqual({ + // type: "object", + // properties: { + // featured: { + // type: "object", + // properties: { + // category: { type: "string" }, + // }, + // }, + // }, + // }); + + // expect(multiEntitySchema).toEqual({ + // type: "object", + // properties: { + // categories: { + // type: "array", + // items: { + // type: "object", + // properties: { + // name: { type: "string" }, + // subcategories: { + // type: "array", + // items: { + // type: "object", + // properties: { + // name: { type: "string" }, + // products: { + // type: "array", + // items: { + // type: "object", + // properties: { + // name: { type: "string" }, + // price: { type: "number" }, + // }, + // }, + // }, + // }, + // }, + // }, + // }, + // }, + // }, + // featured: { + // type: "object", + // properties: { + // items: { + // type: "array", + // items: { + // type: "object", + // properties: { + // id: { type: "string" }, + // name: { type: "string" }, + // }, + // }, + // }, + // }, + // }, + // }, + // }); + // }); }); diff --git a/apps/api/src/lib/extract/helpers/spread-schemas.ts b/apps/api/src/lib/extract/helpers/spread-schemas.ts index 8ba9f42880..c4026238b0 100644 --- a/apps/api/src/lib/extract/helpers/spread-schemas.ts +++ b/apps/api/src/lib/extract/helpers/spread-schemas.ts @@ -1,3 +1,5 @@ +import { logger } from "../../../lib/logger"; + export async function spreadSchemas( schema: any, keys: string[], @@ -6,14 +8,45 @@ export async function spreadSchemas( multiEntitySchema: any; }> { let singleAnswerSchema = { ...schema, properties: { ...schema.properties } }; - let multiEntitySchema: any = { type: "object", properties: {} }; + let multiEntitySchema: any = { + type: "object", + properties: {}, + ...(schema.required ? { required: [] } : {}) + }; + + // Helper function to check if a property path exists in schema + const hasPropertyPath = (schema: any, path: string[]): boolean => { + let current = schema.properties; + for (let i = 0; i < path.length; i++) { + if (!current[path[i]]) return false; + if (current[path[i]].type === "array" && current[path[i]].items) { + current = current[path[i]].items.properties; + } else { + current = current[path[i]].properties; + } + } + return true; + }; + + // Helper function to get the root property of a dot path + const getRootProperty = (path: string): string => { + return path.split('.')[0]; + }; keys.forEach((key) => { - if (singleAnswerSchema.properties[key]) { - multiEntitySchema.properties[key] = singleAnswerSchema.properties[key]; - delete singleAnswerSchema.properties[key]; + const rootProperty = getRootProperty(key); + if (singleAnswerSchema.properties[rootProperty]) { + multiEntitySchema.properties[rootProperty] = singleAnswerSchema.properties[rootProperty]; + delete singleAnswerSchema.properties[rootProperty]; + + // Move required field if it exists + if (schema.required?.includes(rootProperty)) { + multiEntitySchema.required.push(rootProperty); + singleAnswerSchema.required = schema.required.filter((k: string) => k !== rootProperty); + } } }); + // Recursively delete empty properties in singleAnswerSchema const deleteEmptyProperties = (schema: any) => { for (const key in schema.properties) { @@ -34,10 +67,14 @@ export async function spreadSchemas( // If singleAnswerSchema has no properties left, return an empty object if (Object.keys(singleAnswerSchema.properties).length === 0) { singleAnswerSchema = {}; + } else if (singleAnswerSchema.required?.length === 0) { + delete singleAnswerSchema.required; } if (Object.keys(multiEntitySchema.properties).length === 0) { multiEntitySchema = {}; + } else if (multiEntitySchema.required?.length === 0) { + delete multiEntitySchema.required; } return { From 2c391b01051e564ae1cc3fa6c547ddfecda99097 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 24 Jan 2025 18:09:25 -0300 Subject: [PATCH 4/6] Nick: --- apps/api/src/lib/extract/build-prompts.ts | 10 ++++++++-- apps/api/src/lib/extract/reranker.ts | 12 +++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts index 51ff447d3b..450e29f558 100644 --- a/apps/api/src/lib/extract/build-prompts.ts +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -31,9 +31,15 @@ Return only a concise sentece or 2 focused on the essential data points that the } export function buildRerankerSystemPrompt(): string { - return "You are a relevance expert. Analyze the provided URLs and their content to determine their relevance to the user's query and intent. For each URL, assign a relevance score between 0 and 1, where 1 means highly relevant and 0 means not relevant at all. Only include URLs that are actually relevant to the query."; + return `You are a relevance expert scoring links from a website the user is trying to + extract information from. Analyze the provided URLs and their content + to determine their relevance to the user's query and intent. + For each URL, assign a relevance score between 0 and 1, where 1 + means highly relevant and we should extract the content from it and 0 means not relevant at all, we should not extract the content from it. + Always return all the links scored that you are giving. Do not omit links. + Always return the links in the same order they were provided. If the user wants the content from all the links, all links should be scored 1.`; } export function buildRerankerUserPrompt(searchQuery: string): string { - return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction.`; + return `Given these URLs, rank which ones are relevant to the user's extraction intent: "${searchQuery}".`; } diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts index eec5dc9292..90d4ca21a1 100644 --- a/apps/api/src/lib/extract/reranker.ts +++ b/apps/api/src/lib/extract/reranker.ts @@ -8,6 +8,7 @@ import { searchSimilarPages } from "./index/pinecone"; import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract"; import { buildRerankerUserPrompt } from "./build-prompts"; import { buildRerankerSystemPrompt } from "./build-prompts"; +import { dumpToFile } from "./helpers/dump-to-file"; const cohere = new CohereClient({ token: process.env.COHERE_API_KEY, @@ -158,7 +159,7 @@ function filterAndProcessLinks( } export type RerankerResult = { - mapDocument: (MapDocument & { relevanceScore?: number })[]; + mapDocument: (MapDocument & { relevanceScore?: number; reason?: string })[]; tokensUsed: number; }; @@ -170,7 +171,7 @@ export type RerankerOptions = { export async function rerankLinksWithLLM(options: RerankerOptions): Promise { const { links, searchQuery, urlTraces } = options; - const chunkSize = 100; + const chunkSize = 20; const chunks: MapDocument[][] = []; const TIMEOUT_MS = 20000; const MAX_RETRIES = 2; @@ -193,7 +194,7 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise resolve(null), TIMEOUT_MS); }); + // dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent]) const completionPromise = generateOpenAICompletions( logger.child({ method: "rerankLinksWithLLM", @@ -233,7 +235,7 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise { const link = links.find((link) => link.url === result.url); if (link) { - return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0 }; + return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0, reason: result.reason }; } return undefined; }) From 10133adcc658de39ee5b73d6909e5f7e49829d54 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 24 Jan 2025 18:35:36 -0300 Subject: [PATCH 5/6] Update reranker.ts --- apps/api/src/lib/extract/reranker.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts index 90d4ca21a1..100c860295 100644 --- a/apps/api/src/lib/extract/reranker.ts +++ b/apps/api/src/lib/extract/reranker.ts @@ -171,7 +171,7 @@ export type RerankerOptions = { export async function rerankLinksWithLLM(options: RerankerOptions): Promise { const { links, searchQuery, urlTraces } = options; - const chunkSize = 20; + const chunkSize = 100; const chunks: MapDocument[][] = []; const TIMEOUT_MS = 20000; const MAX_RETRIES = 2; From 4747c6f5699e2c95a21f8337fafccbacfacbd025 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 24 Jan 2025 19:19:18 -0300 Subject: [PATCH 6/6] Update build-prompts.ts --- apps/api/src/lib/extract/build-prompts.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts index 450e29f558..ac5da8d80f 100644 --- a/apps/api/src/lib/extract/build-prompts.ts +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -31,9 +31,8 @@ Return only a concise sentece or 2 focused on the essential data points that the } export function buildRerankerSystemPrompt(): string { - return `You are a relevance expert scoring links from a website the user is trying to - extract information from. Analyze the provided URLs and their content - to determine their relevance to the user's query and intent. + return `You are a relevance expert scoring links from a website the user is trying to extract information from. Analyze the provided URLs and their content +to determine their relevance to the user's query and intent. For each URL, assign a relevance score between 0 and 1, where 1 means highly relevant and we should extract the content from it and 0 means not relevant at all, we should not extract the content from it. Always return all the links scored that you are giving. Do not omit links.