From 306f13b3e0f0b512ef5eedf7d732aa1e7867cc93 Mon Sep 17 00:00:00 2001 From: Kevin Cheung Date: Tue, 21 May 2024 16:21:44 -0700 Subject: [PATCH 1/5] Firebase plugin doc improvements --- docs/plugins/firebase.md | 42 +++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/docs/plugins/firebase.md b/docs/plugins/firebase.md index f5901a1b3..3465ce5fe 100644 --- a/docs/plugins/firebase.md +++ b/docs/plugins/firebase.md @@ -15,11 +15,21 @@ The Firebase plugin provides several integrations with Firebase services: npm i --save @genkit-ai/firebase ``` +## Prerequisites + +- All Firebase products require a Firebase project. You can create a new project + or enable Firebase in an existing Google Cloud project using the + [Firebase console](https://console.firebase.google.com/). +- In addition, if you want to deploy flows to Cloud Functions, you must + [upgrade your project](https://console.firebase.google.com/project/_/overview?purchaseBillingPlan=metered) + to the Blaze pay-as-you-go plan. 
+ ## Configuration To use this plugin, specify it when you call `configureGenkit()`: ```js +import { configureGenkit } from '@genkit-ai/core'; import { firebase } from '@genkit-ai/firebase'; configureGenkit({ @@ -68,6 +78,7 @@ retrievers, `defineFirestoreRetriever()`: ```js import { defineFirestoreRetriever } from '@genkit-ai/firebase'; +import { retrieve } from '@genkit-ai/ai/retriever'; import { initializeApp } from 'firebase-admin/app'; import { getFirestore } from 'firebase-admin/firestore'; @@ -75,12 +86,12 @@ const app = initializeApp(); const firestore = getFirestore(app); const yourRetrieverRef = defineFirestoreRetriever({ - name: 'yourRetriever', + name: 'yourRetriever', // For display in the developer UI firestore: getFirestore(app), collection: 'yourCollection', - contentField: 'yourDataChunks', + contentField: 'yourContentField', vectorField: 'embedding', - embedder: textEmbeddingGecko, + embedder: textEmbeddingGecko, // Import from '@genkit-ai/googleai' or '@genkit-ai/vertexai' distanceMeasure: 'COSINE', // 'EUCLIDEAN', 'DOT_PRODUCT', or 'COSINE' (default) }); ``` @@ -91,11 +102,18 @@ To use it, pass it to the `retrieve()` function: const docs = await retrieve({ retriever: yourRetrieverRef, query: 'look for something', - config: { limit: 5 }, + options: { limit: 5 }, }); ``` -For indexing, use an embedding generator along with the Admin SDK: +Firestore depends on indexes to provide fast and efficient querying on +collections. The prior example requires the `embedding` field to be indexed to +work. To do so, call `retrieve()` and Firestore will throw an error with a +command to create an index. Execute that command and your index should be ready +to use. 
+ +To populate your Firestore collection, use an embedding generator along with the +Admin SDK: ```js import { initializeApp } from 'firebase-admin'; @@ -108,7 +126,7 @@ const firestore = getFirestore(app); const indexConfig = { collection: 'yourCollection', - contentField: 'yourDataChunks', + contentField: 'yourContentField', vectorField: 'embedding', embedder: textEmbeddingGecko, }; @@ -125,14 +143,12 @@ async function indexToFirestore(content) { } ``` -Firestore depends on indexes to provide fast and efficient querying on -collections. The prior example requires the `embedding` field to be indexed to -work. To do so, invoke the function and Firestore will throw an error with a -command to create an index. Execute that command and your index should be ready -to use. +#### Learn more -See the [Retrieval-augmented generation](../rag.md) page for a general -discussion on indexers and retrievers. +- See the [Retrieval-augmented generation](../rag.md) page for a general + discussion on indexers and retrievers in Genkit. +- See [Search with vector embeddings](https://firebase.google.com/docs/firestore/vector-search) + in the Cloud Firestore docs for more on the vector search feature. ### Cloud Firestore trace storage From 4ad11cb1c1a15946453c60c3f60c797b480146cc Mon Sep 17 00:00:00 2001 From: Kevin Cheung Date: Tue, 21 May 2024 16:30:10 -0700 Subject: [PATCH 2/5] link to rag.md --- docs/plugins/firebase.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/plugins/firebase.md b/docs/plugins/firebase.md index 3465ce5fe..3f674006b 100644 --- a/docs/plugins/firebase.md +++ b/docs/plugins/firebase.md @@ -73,6 +73,11 @@ use together or individually. You can use Cloud Firestore as a vector store for RAG indexing and retrieval. +This section contains information specific to the `firebase` plugin and Cloud +Firestore's vector search feature. +See the [Retrieval-augmented generation](../rag.md) page for a general +discussion on implementing RAG using Genkit. 
+ The `firebase` plugin provides a convenience function for defining Firestore retrievers, `defineFirestoreRetriever()`: From 1593deb9c41ac4ec9f824acc75f35e3a9d096759 Mon Sep 17 00:00:00 2001 From: Kevin Cheung Date: Wed, 22 May 2024 14:27:23 -0700 Subject: [PATCH 3/5] changes --- docs/plugins/firebase.md | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/docs/plugins/firebase.md b/docs/plugins/firebase.md index 3f674006b..7a03c8e67 100644 --- a/docs/plugins/firebase.md +++ b/docs/plugins/firebase.md @@ -75,7 +75,7 @@ You can use Cloud Firestore as a vector store for RAG indexing and retrieval. This section contains information specific to the `firebase` plugin and Cloud Firestore's vector search feature. -See the [Retrieval-augmented generation](../rag.md) page for a general +See the [Retrieval-augmented generation](../rag.md) page for a more detailed discussion on implementing RAG using Genkit. The `firebase` plugin provides a convenience function for defining Firestore @@ -111,12 +111,6 @@ const docs = await retrieve({ }); ``` -Firestore depends on indexes to provide fast and efficient querying on -collections. The prior example requires the `embedding` field to be indexed to -work. To do so, call `retrieve()` and Firestore will throw an error with a -command to create an index. Execute that command and your index should be ready -to use. - To populate your Firestore collection, use an embedding generator along with the Admin SDK: @@ -148,6 +142,19 @@ async function indexToFirestore(content) { } ``` +Firestore depends on indexes to provide fast and efficient querying on +collections. (Note that "index" here refers to database indexes, and not +Genkit's indexer and retriever abstractions.) + +The retriever example above requires the `embedding` field to be indexed to +work. 
To create the index: + +- Run the `gcloud` command described in the + [Create a single-field vector index](https://firebase.google.com/docs/firestore/vector-search?hl=en&authuser=0#create_and_manage_vector_indexes) + section of the Firestore docs. +- Alternatively, call `retrieve()` and Firestore will throw an error with the + correct command to create the index. + #### Learn more - See the [Retrieval-augmented generation](../rag.md) page for a general From 8215645ca67f12ace75f8f084896c7e1058c846a Mon Sep 17 00:00:00 2001 From: Kevin Cheung Date: Thu, 23 May 2024 11:35:27 -0700 Subject: [PATCH 4/5] indexer sample --- docs/plugins/firebase.md | 98 ++++++++++++++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 19 deletions(-) diff --git a/docs/plugins/firebase.md b/docs/plugins/firebase.md index 7a03c8e67..a3e4cd554 100644 --- a/docs/plugins/firebase.md +++ b/docs/plugins/firebase.md @@ -112,33 +112,81 @@ const docs = await retrieve({ ``` To populate your Firestore collection, use an embedding generator along with the -Admin SDK: +Admin SDK. 
For example, the menu ingestion script from the +[Retrieval-augmented generation](../rag.md) page could be adapted for Firestore +in the following way: ```js -import { initializeApp } from 'firebase-admin'; -import { getFirestore, FieldValue } from 'firebase-admin/firestore'; -import { textEmbeddingGecko } from '@genkit-ai/vertexai'; -import { embed } from '@genkit-ai/ai/embedder'; +import { configureGenkit } from "@genkit-ai/core"; +import { embed } from "@genkit-ai/ai/embedder"; +import { defineFlow, run } from "@genkit-ai/flow"; +import { textEmbeddingGecko, vertexAI } from "@genkit-ai/vertexai"; -const app = initializeApp(); -const firestore = getFirestore(app); +import { applicationDefault, initializeApp } from "firebase-admin/app"; +import { FieldValue, getFirestore } from "firebase-admin/firestore"; + +import { chunk } from "llm-chunk"; +import pdf from "pdf-parse"; +import * as z from "zod"; + +import { readFile } from "fs/promises"; +import path from "path"; const indexConfig = { - collection: 'yourCollection', - contentField: 'yourContentField', - vectorField: 'embedding', + collection: "menuInfo", + contentField: "text", + vectorField: "embedding", embedder: textEmbeddingGecko, }; -async function indexToFirestore(content) { - const embedding = await embed({ - embedder: indexConfig.embedder, - content, - }); - await firestore.collection(indexConfig.collection).add({ - [indexConfig.vectorField]: FieldValue.vector(embedding), - [indexConfig.contentField]: content, - }); +configureGenkit({ + plugins: [vertexAI({ location: "us-central1" })], + enableTracingAndMetrics: false, +}); + +const app = initializeApp({ credential: applicationDefault() }); +const firestore = getFirestore(app); + +export const indexMenu = defineFlow( + { + name: "indexMenu", + inputSchema: z.string().describe("PDF file path"), + outputSchema: z.void(), + }, + async (filePath: string) => { + filePath = path.resolve(filePath); + + // Read the PDF. 
+ const pdfTxt = await run("extract-text", () => + extractTextFromPdf(filePath) + ); + + // Divide the PDF text into segments. + const chunks = await run("chunk-it", async () => chunk(pdfTxt)); + + // Add chunks to the index. + await run("index-chunks", async () => indexToFirestore(chunks)); + } +); + +async function indexToFirestore(data: string[]) { + for (const text of data) { + const embedding = await embed({ + embedder: indexConfig.embedder, + content: text, + }); + await firestore.collection(indexConfig.collection).add({ + [indexConfig.vectorField]: FieldValue.vector(embedding), + [indexConfig.contentField]: text, + }); + } +} + +async function extractTextFromPdf(filePath: string) { + const pdfFile = path.resolve(filePath); + const dataBuffer = await readFile(pdfFile); + const data = await pdf(dataBuffer); + return data.text; } ``` @@ -152,6 +200,18 @@ work. To create the index: - Run the `gcloud` command described in the [Create a single-field vector index](https://firebase.google.com/docs/firestore/vector-search?hl=en&authuser=0#create_and_manage_vector_indexes) section of the Firestore docs. + + The command looks like the following: + + ```posix-terminal + gcloud alpha firestore indexes composite create --project=your-project-id \ + --collection-group=yourCollectionName --query-scope=COLLECTION \ + --field-config=vector-config='{"dimension":"768","flat": "{}"}',field-path=yourEmbeddingField + ``` + + However, the correct indexing configuration depends on the queries you will + make and the embedding model you're using. + - Alternatively, call `retrieve()` and Firestore will throw an error with the correct command to create the index. 
From 11ee2d1a0653641a6b4be5c6892076eba7dafc29 Mon Sep 17 00:00:00 2001 From: Kevin Cheung Date: Thu, 23 May 2024 11:40:44 -0700 Subject: [PATCH 5/5] cfg comment --- docs/plugins/firebase.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/plugins/firebase.md b/docs/plugins/firebase.md index a3e4cd554..a1b522516 100644 --- a/docs/plugins/firebase.md +++ b/docs/plugins/firebase.md @@ -132,6 +132,7 @@ import * as z from "zod"; import { readFile } from "fs/promises"; import path from "path"; +// Change these values to match your Firestore config/schema const indexConfig = { collection: "menuInfo", contentField: "text",