diff --git a/docs/rag.md b/docs/rag.md index fb60fd7ee2..e8fc8c7b86 100644 --- a/docs/rag.md +++ b/docs/rag.md @@ -303,10 +303,39 @@ export const ragFlow = defineFlow( It's also possible to create your own retriever. This is useful if your documents are managed in a document store that is not supported in Genkit (eg: -MySQL, Google Drive, etc.). The Genkit SDK provides a flexible `defineRetriever` -method that lets you provide custom code for fetching documents. You can also -define custom retrievers that build on top of existing retrievers in Genkit and -apply advanced RAG techniques (such as reranking or prompt extensions) on top. +MySQL, Google Drive, etc.). The Genkit SDK provides flexible methods that let +you provide custom code for fetching documents. You can also define custom +retrievers that build on top of existing retrievers in Genkit and apply advanced +RAG techniques (such as reranking or prompt extensions) on top. + +### Simple Retrievers + +Simple retrievers let you easily convert existing code into retrievers: + +```javascript +import { + defineSimpleRetriever, + retrieve +} from '@genkit-ai/ai/retriever'; +import { searchEmails } from './db'; +import { z } from 'zod'; + +defineSimpleRetriever({ + name: 'myDatabase', + configSchema: z.object({ + limit: z.number().optional() + }).optional(), + // we'll extract "message" from the returned email item + content: 'message', + // and several keys to use as metadata + metadata: ['from', 'to', 'subject'], +}, async (query, config) => { + const result = await searchEmails(query.text(), {limit: config.limit}); + return result.data.emails; +}); +``` + +### Custom Retrievers ```javascript import { diff --git a/js/ai/src/retriever.ts b/js/ai/src/retriever.ts index 1ea4d9a4e8..e3c6a89240 100644 --- a/js/ai/src/retriever.ts +++ b/js/ai/src/retriever.ts @@ -14,7 +14,7 @@ * limitations under the License. 
import { Action, GenkitError, defineAction } from '@genkit-ai/core';
import { lookupAction } from '@genkit-ai/core/registry';
import * as z from 'zod';
import { Document, DocumentData, DocumentDataSchema } from './document.js';

/**
 * Converts a single item returned by a simple retriever handler into a
 * `Document`.
 *
 * Resolution order:
 * 1. A string item becomes a text document as-is.
 * 2. If `options.content` is a function, it is called with the item; a string
 *    result becomes a text document, anything else is used as the document's
 *    `content`.
 * 3. If `options.content` is a string and the item is an object, the value
 *    stored under that key is used as the document text.
 *
 * @param item An item produced by the retriever handler.
 * @param options The simple retriever options describing how to extract content.
 * @returns The document built from the item.
 * @throws GenkitError with status `INVALID_ARGUMENT` when the item is falsy, or
 *   when none of the rules above apply to it.
 */
function itemToDocument(
  item: any,
  options: SimpleRetrieverOptions
): Document {
  // NOTE: this is a truthiness check, so besides null/undefined it also
  // rejects other falsy items such as '' and 0.
  if (!item) {
    throw new GenkitError({
      status: 'INVALID_ARGUMENT',
      message: `Items returned from simple retriever must be non-null.`,
    });
  }
  if (typeof item === 'string') return Document.fromText(item);
  if (typeof options.content === 'function') {
    const transformed = options.content(item);
    return typeof transformed === 'string'
      ? Document.fromText(transformed)
      : new Document({ content: transformed });
  }
  if (typeof options.content === 'string' && typeof item === 'object') {
    return Document.fromText(item[options.content]);
  }
  throw new GenkitError({
    status: 'INVALID_ARGUMENT',
    message: `Cannot convert item to document without content option. Item: ${JSON.stringify(item)}`,
  });
}

/**
 * Extracts document metadata from a single item returned by a simple
 * retriever handler.
 *
 * Resolution order:
 * 1. A string item has no metadata (`undefined`).
 * 2. If `options.metadata` is an array of keys and the item is an object, only
 *    those keys are copied from the item.
 * 3. If `options.metadata` is a function, its result for the item is used.
 * 4. If `options.metadata` is not set and the item is an object, a shallow
 *    copy of the item is used, minus the key named by a string
 *    `options.content` (that value is already the document content).
 *
 * @param item An item produced by the retriever handler.
 * @param options The simple retriever options describing how to extract metadata.
 * @returns The extracted metadata, or `undefined` for string items.
 * @throws GenkitError with status `INVALID_ARGUMENT` when none of the rules
 *   above apply to the item.
 */
function itemToMetadata(
  item: any,
  options: SimpleRetrieverOptions
): Document['metadata'] {
  if (typeof item === 'string') return undefined;
  if (Array.isArray(options.metadata) && typeof item === 'object') {
    const out: Record<string, any> = {};
    options.metadata.forEach((key) => (out[key] = item[key]));
    // Return the picked keys; without this the function fell through to the
    // final throw for every key-list configuration.
    return out;
  }
  if (typeof options.metadata === 'function') return options.metadata(item);
  if (!options.metadata && typeof item === 'object') {
    const out = { ...item };
    if (typeof options.content === 'string') delete out[options.content];
    return out;
  }
  throw new GenkitError({
    status: 'INVALID_ARGUMENT',
    message: `Unable to extract metadata from item with supplied options. Item: ${JSON.stringify(item)}`,
  });
}

export interface SimpleRetrieverOptions<
  C extends z.ZodTypeAny = z.ZodTypeAny,
  R = any,
> {
  /** The name of the retriever you're creating. */
  name: string;
  /** A Zod schema containing any configuration info available beyond the query. */
  configSchema?: C;
  /**
   * Specifies how to extract content from the returned items.
   *
   * - If a string, specifies the key of the returned item to extract as content.
   * - If a function, allows you to extract content as text or a document part.
   **/
  content?: string | ((item: R) => Document['content'] | string);
  /**
   * Specifies how to extract metadata from the returned items.
   *
   * - If an array of strings, specifies list of keys to extract from returned objects.
   * - If a function, allows you to use custom behavior to extract metadata from returned items.
   */
  metadata?: string[] | ((item: R) => Document['metadata']);
}

/**
 * defineSimpleRetriever makes it easy to map existing data into documents that
 * can be used for prompt augmentation.
 *
 * Each item returned by `handler` is converted to a document according to
 * `options.content`; for non-string items, metadata is attached according to
 * `options.metadata`.
 *
 * @param options Configuration options for the retriever.
 * @param handler A function that queries a datastore and returns items from which to extract documents.
 * @returns A Genkit retriever.
 * @throws GenkitError with status `INVALID_ARGUMENT` (when the retriever runs)
 *   if an item cannot be converted with the supplied options.
 */
export function defineSimpleRetriever<
  C extends z.ZodTypeAny = z.ZodTypeAny,
  R = any,
>(
  options: SimpleRetrieverOptions<C, R>,
  handler: (query: Document, config: z.infer<C>) => Promise<R[]>
) {
  return defineRetriever(
    {
      name: options.name,
      configSchema: options.configSchema,
    },
    async (query, config) => {
      const result = await handler(query, config);
      return {
        documents: result.map((item) => {
          const doc = itemToDocument(item, options);
          // String items carry no metadata, so only object-like items get any.
          if (typeof item !== 'string')
            doc.metadata = itemToMetadata(item, options);
          return doc;
        }),
      };
    }
  );
}