From 5393e6b4bf46e3e0ed33711e98d2fa36df4e7a46 Mon Sep 17 00:00:00 2001 From: Michael Bleigh Date: Sat, 11 May 2024 14:08:50 -0700 Subject: [PATCH 1/5] Adds defineSimpleRetriever and docs. --- docs/rag.md | 37 +++++++++-- js/ai/src/retriever.ts | 2 + js/ai/src/simpleRetriever.ts | 122 +++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 4 deletions(-) create mode 100644 js/ai/src/simpleRetriever.ts diff --git a/docs/rag.md b/docs/rag.md index fb60fd7ee2..6165e99966 100644 --- a/docs/rag.md +++ b/docs/rag.md @@ -303,10 +303,39 @@ export const ragFlow = defineFlow( It's also possible to create your own retriever. This is useful if your documents are managed in a document store that is not supported in Genkit (eg: -MySQL, Google Drive, etc.). The Genkit SDK provides a flexible `defineRetriever` -method that lets you provide custom code for fetching documents. You can also -define custom retrievers that build on top of existing retrievers in Genkit and -apply advanced RAG techniques (such as reranking or prompt extensions) on top. +MySQL, Google Drive, etc.). The Genkit SDK provides flexible methods that let +you provide custom code for fetching documents. You can also define custom +retrievers that build on top of existing retrievers in Genkit and apply advanced +RAG techniques (such as reranking or prompt extensions) on top. 
+ +### Simple Retrievers + +Simple retrievers let you easily convert existing code into retrievers: + +```javascript +import { + defineSimpleRetriever, + retrieve +} from "@genkit-ai/ai/retriever"; +import { searchEmails } from "./db"; +import { z } from "zod"; + +defineSimpleRetriever({ + name: 'myDatabase', + configSchema: z.object({ + limit: z.number().optional() + }).optional(), + // we'll extract "message" from the returned email item + content: 'message', + // and several keys to use as metadata + metadata: ['from', 'to', 'subject'], +}, async (query, config) => { + const result = await searchEmails(query.text(), {limit: config.limit}); + return result.data.emails; +}); +``` + +### Custom Retrievers ```javascript import { diff --git a/js/ai/src/retriever.ts b/js/ai/src/retriever.ts index 1ea4d9a4e8..a980716123 100644 --- a/js/ai/src/retriever.ts +++ b/js/ai/src/retriever.ts @@ -294,3 +294,5 @@ export function indexerRef< ): IndexerReference { return { ...options }; } + +export { defineSimpleRetriever } from './simpleRetriever.js'; diff --git a/js/ai/src/simpleRetriever.ts b/js/ai/src/simpleRetriever.ts new file mode 100644 index 0000000000..a46874f35b --- /dev/null +++ b/js/ai/src/simpleRetriever.ts @@ -0,0 +1,122 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import { GenkitError } from '@genkit-ai/core'; +import { z } from 'zod'; +import { Document, defineRetriever } from './retriever.js'; + +function itemToDocument( + item: any, + options: SimpleRetrieverOptions +): Document { + if (!item) + throw new GenkitError({ + status: 'INVALID_ARGUMENT', + message: `Items returned from simple retriever must be non-null.`, + }); + if (typeof item === 'string') return Document.fromText(item); + if (typeof options.content === 'function') { + const transformed = content(item); + return typeof transformed === 'string' + ? Document.fromText(transformed) + : new Document({ content: transformed }); + } + if (typeof options.content === 'string' && typeof item === 'object') + return new Document(item[options.content]); + throw new GenkitError({ + status: 'INVALID_ARGUMENT', + message: `Cannot convert item to document without content option. Item: ${JSON.stringify(item)}`, + }); +} + +function itemToMetadata( + item: any, + options: SimpleRetrieverOptions +): Document['metadata'] { + if (typeof item === 'string') return undefined; + if (Array.isArray(options.metadata) && typeof item === 'object') { + const out: Record = {}; + options.metadata.forEach((key) => (out[key] = item[key])); + } + if (typeof options.metadata === 'function') return options.metadata(item); + if (!options.metadata && typeof item === 'object') { + const out = { ...item }; + if (typeof options.content === 'string') delete out[options.content]; + return out; + } + throw new GenkitError({ + status: 'INVALID_ARGUMENT', + message: `Unable to extract metadata from item with supplied options. Item: ${JSON.stringify(item)}`, + }); +} + +export interface SimpleRetrieverOptions< + C extends z.ZodTypeAny = z.ZodTypeAny, + R = any, +> { + /** The name of the retriever you're creating. */ + name: string; + /** A Zod schema containing any configuration info available beyond the query. 
*/ + configSchema?: C; + /** + * Specifies how to extract content from the returned items. + * + * - If a string, specifies the key of the returned item to extract as content. + * - If a function, allows you to extract content as text or a document part. + **/ + content?: string | ((item: R) => Document['content'] | string); + /** + * Specifies how to extract metadata from the returned items. + * + * - If an array of strings, specifies list of keys to extract from returned objects. + * - If a function, allows you to use custom behavior to extract metadata from returned items. + */ + metadata?: string[] | ((item: R) => Document['metadata']); +} + +/** + * defineSimpleRetriever makes it easy to map existing data into documents that + * can be used for prompt augmentation. + * + * @param options Configuration options for the retriever. + * @param handler A function that queries a datastore and returns items from which to extract documents. + * @returns A Genkit retriever. + */ +export function defineSimpleRetriever< + C extends z.ZodTypeAny = z.ZodTypeAny, + R = any, +>( + options: SimpleRetrieverOptions, + handler: (query: Document, config: z.infer) => Promise +) { + return defineRetriever( + { + name: options.name, + configSchema: options.configSchema, + }, + async (query, config) => { + const result = await handler(query, config); + return { + documents: result.map((item) => { + const doc = itemToDocument(item, options); + if (typeof item !== 'string') + doc.metadata = itemToMetadata(item, options); + return doc; + }), + }; + } + ); +} From 3e41d95eee3c184ffb74f0e188c7b31e5f01afaa Mon Sep 17 00:00:00 2001 From: Michael Bleigh Date: Sat, 11 May 2024 14:12:48 -0700 Subject: [PATCH 2/5] fix --- js/ai/src/simpleRetriever.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/ai/src/simpleRetriever.ts b/js/ai/src/simpleRetriever.ts index a46874f35b..cff591dd4b 100644 --- a/js/ai/src/simpleRetriever.ts +++ b/js/ai/src/simpleRetriever.ts @@ -29,7 +29,7 @@ 
function itemToDocument( }); if (typeof item === 'string') return Document.fromText(item); if (typeof options.content === 'function') { - const transformed = content(item); + const transformed = options.content(item); return typeof transformed === 'string' ? Document.fromText(transformed) : new Document({ content: transformed }); From 4b6e42fa125160cfa50041d80822700a405f1494 Mon Sep 17 00:00:00 2001 From: Michael Bleigh Date: Sat, 11 May 2024 14:15:06 -0700 Subject: [PATCH 3/5] fix --- js/ai/src/simpleRetriever.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/ai/src/simpleRetriever.ts b/js/ai/src/simpleRetriever.ts index cff591dd4b..794158bfee 100644 --- a/js/ai/src/simpleRetriever.ts +++ b/js/ai/src/simpleRetriever.ts @@ -35,7 +35,7 @@ function itemToDocument( : new Document({ content: transformed }); } if (typeof options.content === 'string' && typeof item === 'object') - return new Document(item[options.content]); + return Document.fromText(item[options.content]); throw new GenkitError({ status: 'INVALID_ARGUMENT', message: `Cannot convert item to document without content option. Item: ${JSON.stringify(item)}`, From d4e40caa703152bdb2590eaad179878bb5d3bb78 Mon Sep 17 00:00:00 2001 From: Michael Bleigh Date: Sat, 11 May 2024 14:30:56 -0700 Subject: [PATCH 4/5] circular dep --- js/ai/src/retriever.ts | 105 +++++++++++++++++++++++++++++- js/ai/src/simpleRetriever.ts | 122 ----------------------------------- 2 files changed, 103 insertions(+), 124 deletions(-) delete mode 100644 js/ai/src/simpleRetriever.ts diff --git a/js/ai/src/retriever.ts b/js/ai/src/retriever.ts index a980716123..e3c6a89240 100644 --- a/js/ai/src/retriever.ts +++ b/js/ai/src/retriever.ts @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -import { Action, defineAction } from '@genkit-ai/core'; +import { Action, GenkitError, defineAction } from '@genkit-ai/core'; import { lookupAction } from '@genkit-ai/core/registry'; import * as z from 'zod'; import { Document, DocumentData, DocumentDataSchema } from './document.js'; @@ -295,4 +295,105 @@ export function indexerRef< return { ...options }; } -export { defineSimpleRetriever } from './simpleRetriever.js'; +function itemToDocument( + item: any, + options: SimpleRetrieverOptions +): Document { + if (!item) + throw new GenkitError({ + status: 'INVALID_ARGUMENT', + message: `Items returned from simple retriever must be non-null.`, + }); + if (typeof item === 'string') return Document.fromText(item); + if (typeof options.content === 'function') { + const transformed = options.content(item); + return typeof transformed === 'string' + ? Document.fromText(transformed) + : new Document({ content: transformed }); + } + if (typeof options.content === 'string' && typeof item === 'object') + return Document.fromText(item[options.content]); + throw new GenkitError({ + status: 'INVALID_ARGUMENT', + message: `Cannot convert item to document without content option. Item: ${JSON.stringify(item)}`, + }); +} + +function itemToMetadata( + item: any, + options: SimpleRetrieverOptions +): Document['metadata'] { + if (typeof item === 'string') return undefined; + if (Array.isArray(options.metadata) && typeof item === 'object') { + const out: Record = {}; + options.metadata.forEach((key) => (out[key] = item[key])); + } + if (typeof options.metadata === 'function') return options.metadata(item); + if (!options.metadata && typeof item === 'object') { + const out = { ...item }; + if (typeof options.content === 'string') delete out[options.content]; + return out; + } + throw new GenkitError({ + status: 'INVALID_ARGUMENT', + message: `Unable to extract metadata from item with supplied options. 
Item: ${JSON.stringify(item)}`, + }); +} + +export interface SimpleRetrieverOptions< + C extends z.ZodTypeAny = z.ZodTypeAny, + R = any, +> { + /** The name of the retriever you're creating. */ + name: string; + /** A Zod schema containing any configuration info available beyond the query. */ + configSchema?: C; + /** + * Specifies how to extract content from the returned items. + * + * - If a string, specifies the key of the returned item to extract as content. + * - If a function, allows you to extract content as text or a document part. + **/ + content?: string | ((item: R) => Document['content'] | string); + /** + * Specifies how to extract metadata from the returned items. + * + * - If an array of strings, specifies list of keys to extract from returned objects. + * - If a function, allows you to use custom behavior to extract metadata from returned items. + */ + metadata?: string[] | ((item: R) => Document['metadata']); +} + +/** + * defineSimpleRetriever makes it easy to map existing data into documents that + * can be used for prompt augmentation. + * + * @param options Configuration options for the retriever. + * @param handler A function that queries a datastore and returns items from which to extract documents. + * @returns A Genkit retriever. 
+ */ +export function defineSimpleRetriever< + C extends z.ZodTypeAny = z.ZodTypeAny, + R = any, +>( + options: SimpleRetrieverOptions, + handler: (query: Document, config: z.infer) => Promise +) { + return defineRetriever( + { + name: options.name, + configSchema: options.configSchema, + }, + async (query, config) => { + const result = await handler(query, config); + return { + documents: result.map((item) => { + const doc = itemToDocument(item, options); + if (typeof item !== 'string') + doc.metadata = itemToMetadata(item, options); + return doc; + }), + }; + } + ); +} diff --git a/js/ai/src/simpleRetriever.ts b/js/ai/src/simpleRetriever.ts deleted file mode 100644 index 794158bfee..0000000000 --- a/js/ai/src/simpleRetriever.ts +++ /dev/null @@ -1,122 +0,0 @@ -/** - * Copyright 2024 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import { GenkitError } from '@genkit-ai/core'; -import { z } from 'zod'; -import { Document, defineRetriever } from './retriever.js'; - -function itemToDocument( - item: any, - options: SimpleRetrieverOptions -): Document { - if (!item) - throw new GenkitError({ - status: 'INVALID_ARGUMENT', - message: `Items returned from simple retriever must be non-null.`, - }); - if (typeof item === 'string') return Document.fromText(item); - if (typeof options.content === 'function') { - const transformed = options.content(item); - return typeof transformed === 'string' - ? 
Document.fromText(transformed) - : new Document({ content: transformed }); - } - if (typeof options.content === 'string' && typeof item === 'object') - return Document.fromText(item[options.content]); - throw new GenkitError({ - status: 'INVALID_ARGUMENT', - message: `Cannot convert item to document without content option. Item: ${JSON.stringify(item)}`, - }); -} - -function itemToMetadata( - item: any, - options: SimpleRetrieverOptions -): Document['metadata'] { - if (typeof item === 'string') return undefined; - if (Array.isArray(options.metadata) && typeof item === 'object') { - const out: Record = {}; - options.metadata.forEach((key) => (out[key] = item[key])); - } - if (typeof options.metadata === 'function') return options.metadata(item); - if (!options.metadata && typeof item === 'object') { - const out = { ...item }; - if (typeof options.content === 'string') delete out[options.content]; - return out; - } - throw new GenkitError({ - status: 'INVALID_ARGUMENT', - message: `Unable to extract metadata from item with supplied options. Item: ${JSON.stringify(item)}`, - }); -} - -export interface SimpleRetrieverOptions< - C extends z.ZodTypeAny = z.ZodTypeAny, - R = any, -> { - /** The name of the retriever you're creating. */ - name: string; - /** A Zod schema containing any configuration info available beyond the query. */ - configSchema?: C; - /** - * Specifies how to extract content from the returned items. - * - * - If a string, specifies the key of the returned item to extract as content. - * - If a function, allows you to extract content as text or a document part. - **/ - content?: string | ((item: R) => Document['content'] | string); - /** - * Specifies how to extract metadata from the returned items. - * - * - If an array of strings, specifies list of keys to extract from returned objects. - * - If a function, allows you to use custom behavior to extract metadata from returned items. 
- */ - metadata?: string[] | ((item: R) => Document['metadata']); -} - -/** - * defineSimpleRetriever makes it easy to map existing data into documents that - * can be used for prompt augmentation. - * - * @param options Configuration options for the retriever. - * @param handler A function that queries a datastore and returns items from which to extract documents. - * @returns A Genkit retriever. - */ -export function defineSimpleRetriever< - C extends z.ZodTypeAny = z.ZodTypeAny, - R = any, ->( - options: SimpleRetrieverOptions, - handler: (query: Document, config: z.infer) => Promise -) { - return defineRetriever( - { - name: options.name, - configSchema: options.configSchema, - }, - async (query, config) => { - const result = await handler(query, config); - return { - documents: result.map((item) => { - const doc = itemToDocument(item, options); - if (typeof item !== 'string') - doc.metadata = itemToMetadata(item, options); - return doc; - }), - }; - } - ); -} From 7318f7bdf93683c3b5cceef60281b18f040a8bee Mon Sep 17 00:00:00 2001 From: Michael Bleigh Date: Sun, 12 May 2024 10:23:58 -0700 Subject: [PATCH 5/5] Update docs/rag.md Co-authored-by: Pavel Jbanov --- docs/rag.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/rag.md b/docs/rag.md index 6165e99966..e8fc8c7b86 100644 --- a/docs/rag.md +++ b/docs/rag.md @@ -316,9 +316,9 @@ Simple retrievers let you easily convert existing code into retrievers: import { defineSimpleRetriever, retrieve -} from "@genkit-ai/ai/retriever"; -import { searchEmails } from "./db"; -import { z } from "zod"; +} from '@genkit-ai/ai/retriever'; +import { searchEmails } from './db'; +import { z } from 'zod'; defineSimpleRetriever({ name: 'myDatabase',