langchain-ai · jacoblee93 · Nov 20, 2023 · Nov 17, 2023 · Nov 17, 2023 · Nov 17, 2023
diff --git a/docs/api_refs/typedoc.json b/docs/api_refs/typedoc.json
@@ -229,6 +229,7 @@
     "../../langchain/src/retrievers/self_query/pinecone.ts",
     "../../langchain/src/retrievers/self_query/supabase.ts",
     "../../langchain/src/retrievers/self_query/weaviate.ts",
+    "../../langchain/src/retrievers/self_query/vectara.ts",
     "../../langchain/src/retrievers/vespa.ts",
     "../../langchain/src/cache/index.ts",
     "../../langchain/src/cache/cloudflare_kv.ts",

diff --git a/...ocs/modules/data_connection/retrievers/how_to/self_query/vectara-self-query.mdx b/...ocs/modules/data_connection/retrievers/how_to/self_query/vectara-self-query.mdx
@@ -0,0 +1,39 @@
+# Vectara Self Query Retriever
+
+This example shows how to use a self query retriever with a [Vectara](https://vectara.com/) vector store.
+
+If you haven't already set up Vectara, please [follow the instructions here](/docs/integrations/vectorstores/vectara.mdx).
+
+## Usage
+
+This example shows how to initialize a `SelfQueryRetriever` with a vector store:
+
+import CodeBlock from "@theme/CodeBlock";
+import Example from "@examples/retrievers/vectara_self_query.ts";
+
+<CodeBlock language="typescript">{Example}</CodeBlock>
+
+You can also initialize the retriever with default search parameters that apply in
+addition to the generated query:
+
+```typescript
+const selfQueryRetriever = await SelfQueryRetriever.fromLLM({
+  llm,
+  vectorStore,
+  documentContents,
+  attributeInfo,
+  /**
+   * We need to use a translator that translates the queries into a
+   * filter format that the vector store can understand. LangChain provides one here.
+   */
+  structuredQueryTranslator: new VectaraTranslator(),
+  searchParams: {
+    filter: {
+      filter: "( doc.genre = 'science fiction' ) and ( doc.rating > 8.5 )",
+    },
+    mergeFiltersOperator: "and",
+  },
+});
+```
+
+See the [official docs](https://docs.vectara.com/) for more on how to construct metadata filters.
diff --git a/examples/src/retrievers/vectara_self_query.ts b/examples/src/retrievers/vectara_self_query.ts
@@ -0,0 +1,137 @@
+import { AttributeInfo } from "langchain/schema/query_constructor";
+import { Document } from "langchain/document";
+import { SelfQueryRetriever } from "langchain/retrievers/self_query";
+
+import { OpenAI } from "langchain/llms/openai";
+import { VectaraStore } from "langchain/vectorstores/vectara";
+import { VectaraTranslator } from "langchain/retrievers/self_query/vectara";
+import { FakeEmbeddings } from "langchain/embeddings/fake";
+/**
+ * First, we create a bunch of documents. You can load your own documents here instead.
+ * Each document has a pageContent and a metadata field. Make sure your metadata matches the AttributeInfo below.
+ */
+const docs = [ // metadata is sparse: not every document sets every attribute
+  new Document({
+    pageContent:
+      "A bunch of scientists bring back dinosaurs and mayhem breaks loose",
+    metadata: { year: 1993, rating: 7.7, genre: "science fiction" }, // no director
+  }),
+  new Document({
+    pageContent:
+      "Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
+    metadata: { year: 2010, director: "Christopher Nolan", rating: 8.2 }, // no genre
+  }),
+  new Document({
+    pageContent:
+      "A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
+    metadata: { year: 2006, director: "Satoshi Kon", rating: 8.6 }, // no genre
+  }),
+  new Document({
+    pageContent:
+      "A bunch of normal-sized women are supremely wholesome and some men pine after them",
+    metadata: { year: 2019, director: "Greta Gerwig", rating: 8.3 }, // no genre
+  }),
+  new Document({
+    pageContent: "Toys come alive and have a blast doing so",
+    metadata: { year: 1995, genre: "animated" }, // no rating or director
+  }),
+  new Document({
+    pageContent: "Three men walk into the Zone, three men walk out of the Zone",
+    metadata: { // the only document that sets all four attributes
+      year: 1979,
+      rating: 9.9,
+      director: "Andrei Tarkovsky",
+      genre: "science fiction",
+    },
+  }),
+];
+
+/**
+ * Next, we define the attributes we want to be able to query on.
+ * In this case, we want to be able to query on the genre, year, director, and rating of the movie.
+ * We also provide a description of each attribute and the type of the attribute.
+ * This is used to generate the query prompts.
+ *
+ * We need to set up the filters in Vectara as well, otherwise filtering won't work.
+ * To set up the filters in Vectara, go to Data -> {your_created_corpus} -> overview.
+ * In the overview section, edit the filters section and add all of the following attributes
+ * to the filters.
+ */
+const attributeInfo: AttributeInfo[] = [
+  {
+    name: "genre", // same spelling as the "genre" metadata key on the documents
+    description: "The genre of the movie",
+    type: "string or array of strings", // free-text type description, not a TypeScript type
+  },
+  {
+    name: "year", // same spelling as the "year" metadata key
+    description: "The year the movie was released",
+    type: "number",
+  },
+  {
+    name: "director", // same spelling as the "director" metadata key
+    description: "The director of the movie",
+    type: "string",
+  },
+  {
+    name: "rating", // same spelling as the "rating" metadata key
+    description: "The rating of the movie (1-10)", // range is only a textual hint; nothing here enforces it
+    type: "number",
+  },
+];
+
+/**
+ * Next, we instantiate a vector store. This is where the documents are stored and indexed.
+ * Vectara computes embeddings itself, so we pass FakeEmbeddings as a placeholder embeddings object.
+ */
+
+const config = {
+  customerId: Number(process.env.VECTARA_CUSTOMER_ID), // NaN if the env var is unset
+  corpusId: Number(process.env.VECTARA_CORPUS_ID), // NaN if the env var is unset
+  apiKey: String(process.env.VECTARA_API_KEY), // the literal string "undefined" if the env var is unset
+  verbose: true,
+};
+
+const vectorStore = await VectaraStore.fromDocuments(
+  docs,
+  new FakeEmbeddings(), // placeholder embeddings object; presumably unused by Vectara — TODO confirm
+  config
+);
+
+const llm = new OpenAI(); // constructed with default options; presumably reads credentials from the environment — TODO confirm
+const documentContents = "Brief summary of a movie"; // short description of what each document's pageContent holds
+
+const selfQueryRetriever = await SelfQueryRetriever.fromLLM({
+  llm,
+  vectorStore,
+  documentContents,
+  attributeInfo,
+  /**
+   * We need to create a basic translator that translates the queries into a
+   * filter format that the vector store can understand. We provide a basic translator
+   * here, but you can create your own translator by extending BaseTranslator
+   * abstract class. Note that the vector store needs to support filtering on the metadata
+   * attributes you want to query on.
+   */
+  structuredQueryTranslator: new VectaraTranslator(),
+});
+
+/**
+ * Now we can query the vector store.
+ * We can ask questions like "Which movies were released before 2000?" or "Which movies are rated higher than 8.5?".
+ * We can also ask questions like "Which movies are either animated or science fiction and are rated higher than 8.5?".
+ * The retriever will automatically convert these questions into queries that can be used to retrieve documents.
+ */
+const query1 = await selfQueryRetriever.getRelevantDocuments(
+  "What are some movies about dinosaurs" // content-only question; mentions no metadata attribute
+);
+const query2 = await selfQueryRetriever.getRelevantDocuments(
+  "I want to watch a movie rated higher than 8.5" // exercises the numeric "rating" attribute
+);
+const query3 = await selfQueryRetriever.getRelevantDocuments(
+  "Which movies are directed by Greta Gerwig?" // exercises the "director" attribute
+);
+const query4 = await selfQueryRetriever.getRelevantDocuments(
+  "Which movies are either comedy or science fiction and are rated higher than 8.5?" // combines "genre" and "rating"; no document has genre "comedy"
+);
+console.log(query1, query2, query3, query4); // prints the four result arrays
diff --git a/langchain/.gitignore b/langchain/.gitignore
@@ -631,6 +631,9 @@ retrievers/self_query/supabase.d.ts
 retrievers/self_query/weaviate.cjs
 retrievers/self_query/weaviate.js
 retrievers/self_query/weaviate.d.ts
+retrievers/self_query/vectara.cjs
+retrievers/self_query/vectara.js
+retrievers/self_query/vectara.d.ts
 retrievers/vespa.cjs
 retrievers/vespa.js
 retrievers/vespa.d.ts

diff --git a/langchain/package.json b/langchain/package.json
@@ -643,6 +643,9 @@
     "retrievers/self_query/weaviate.cjs",
     "retrievers/self_query/weaviate.js",
     "retrievers/self_query/weaviate.d.ts",
+    "retrievers/self_query/vectara.cjs",
+    "retrievers/self_query/vectara.js",
+    "retrievers/self_query/vectara.d.ts",
     "retrievers/vespa.cjs",
     "retrievers/vespa.js",
     "retrievers/vespa.d.ts",
@@ -2454,6 +2457,11 @@
       "import": "./retrievers/self_query/weaviate.js",
       "require": "./retrievers/self_query/weaviate.cjs"
     },
+    "./retrievers/self_query/vectara": {
+      "types": "./retrievers/self_query/vectara.d.ts",
+      "import": "./retrievers/self_query/vectara.js",
+      "require": "./retrievers/self_query/vectara.cjs"
+    },
     "./retrievers/vespa": {
       "types": "./retrievers/vespa.d.ts",
       "import": "./retrievers/vespa.js",

diff --git a/langchain/scripts/create-entrypoints.js b/langchain/scripts/create-entrypoints.js
@@ -248,6 +248,7 @@ const entrypoints = {
   "retrievers/self_query/pinecone": "retrievers/self_query/pinecone",
   "retrievers/self_query/supabase": "retrievers/self_query/supabase",
   "retrievers/self_query/weaviate": "retrievers/self_query/weaviate",
+  "retrievers/self_query/vectara": "retrievers/self_query/vectara",
   "retrievers/vespa": "retrievers/vespa",
   // cache
   cache: "cache/index",
@@ -456,6 +457,7 @@ const requiresOptionalDependency = [
   "retrievers/self_query/pinecone",
   "retrievers/self_query/supabase",
   "retrievers/self_query/weaviate",
+  "retrievers/self_query/vectara",
   "output_parsers/expression",
   "chains/query_constructor",
   "chains/query_constructor/ir",

diff --git a/langchain/src/load/import_constants.ts b/langchain/src/load/import_constants.ts
@@ -130,6 +130,7 @@ export const optionalImportEntrypoints = [
   "langchain/retrievers/self_query/pinecone",
   "langchain/retrievers/self_query/supabase",
   "langchain/retrievers/self_query/weaviate",
+  "langchain/retrievers/self_query/vectara",
   "langchain/cache/cloudflare_kv",
   "langchain/cache/momento",
   "langchain/cache/redis",

diff --git a/langchain/src/load/import_type.d.ts b/langchain/src/load/import_type.d.ts
@@ -388,6 +388,9 @@ export interface OptionalImportMap {
   "langchain/retrievers/self_query/weaviate"?:
     | typeof import("../retrievers/self_query/weaviate.js")
     | Promise<typeof import("../retrievers/self_query/weaviate.js")>;
+  "langchain/retrievers/self_query/vectara"?:
+    | typeof import("../retrievers/self_query/vectara.js")
+    | Promise<typeof import("../retrievers/self_query/vectara.js")>;
   "langchain/cache/cloudflare_kv"?:
     | typeof import("../cache/cloudflare_kv.js")
     | Promise<typeof import("../cache/cloudflare_kv.js")>;

diff --git a/langchain/src/retrievers/self_query/tests/vectara_self_query.init.test.ts b/langchain/src/retrievers/self_query/tests/vectara_self_query.init.test.ts
@@ -0,0 +1,113 @@
+/* eslint-disable no-process-env */
+import { test } from "@jest/globals";
+import { Document } from "../../../document.js";
+import { AttributeInfo } from "../../../schema/query_constructor.js";
+import { SelfQueryRetriever } from "../index.js";
+import { OpenAI } from "../../../llms/openai.js";
+import { VectaraTranslator } from "../vectara.js";
+import { FakeEmbeddings } from "../../../embeddings/fake.js";
+import { VectaraStore } from "../../../vectorstores/vectara.js";
+
+test.skip("Vectara Self Query Retriever Test", async () => { // skipped by default; needs the VECTARA_* env vars read below
+  const docs = [
+    new Document({
+      pageContent:
+        "A bunch of scientists bring back dinosaurs and mayhem breaks loose",
+      metadata: { year: 1993, rating: 7.7, genre: "science fiction" },
+    }),
+    new Document({
+      pageContent:
+        "Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
+      metadata: { year: 2010, director: "Christopher Nolan", rating: 8.2 },
+    }),
+    new Document({
+      pageContent:
+        "A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
+      metadata: { year: 2006, director: "Satoshi Kon", rating: 8.6 },
+    }),
+    new Document({
+      pageContent:
+        "A bunch of normal-sized women are supremely wholesome and some men pine after them",
+      metadata: { year: 2019, director: "Greta Gerwig", rating: 8.3 },
+    }),
+    new Document({
+      pageContent: "Toys come alive and have a blast doing so",
+      metadata: { year: 1995, genre: "animated" },
+    }),
+    new Document({
+      pageContent:
+        "Three men walk into the Zone, three men walk out of the Zone",
+      metadata: {
+        year: 1979,
+        rating: 9.9,
+        director: "Andrei Tarkovsky",
+        genre: "science fiction",
+      },
+    }),
+  ];
+
+  const attributeInfo: AttributeInfo[] = [
+    {
+      name: "genre",
+      description: "The genre of the movie",
+      type: "string or array of strings",
+    },
+    {
+      name: "year",
+      description: "The year the movie was released",
+      type: "number",
+    },
+    {
+      name: "director",
+      description: "The director of the movie",
+      type: "string",
+    },
+    {
+      name: "rating",
+      description: "The rating of the movie (1-10)",
+      type: "number",
+    },
+  ];
+  const config = {
+    customerId: Number(process.env.VECTARA_CUSTOMER_ID), // NaN if the env var is unset
+    corpusId: Number(process.env.VECTARA_CORPUS_ID), // NaN if the env var is unset
+    apiKey: String(process.env.VECTARA_API_KEY), // the literal string "undefined" if the env var is unset
+    verbose: true,
+  };
+
+  const vectorStore = await VectaraStore.fromDocuments(
+    docs,
+    new FakeEmbeddings(),
+    config
+  );
+
+  const llm = new OpenAI();
+  const documentContents = "Brief summary of a movie";
+
+  const selfQueryRetriever = await SelfQueryRetriever.fromLLM({
+    llm,
+    vectorStore,
+    documentContents,
+    attributeInfo,
+
+    structuredQueryTranslator: new VectaraTranslator(),
+  });
+
+  const query1 = await selfQueryRetriever.getRelevantDocuments(
+    "I want to watch a movie rated higher than 8.5"
+  );
+  const query2 = await selfQueryRetriever.getRelevantDocuments(
+    "Which movies are directed by Greta Gerwig?"
+  );
+  const query3 = await selfQueryRetriever.getRelevantDocuments(
+    "Which movies are either comedy or science fiction and are rated higher than 8.5?"
+  );
+  const query4 = await selfQueryRetriever.getRelevantDocuments(
+    "Wau wau wau wau hello gello hello?" // deliberately nonsensical query
+  );
+  console.log(query1, query2, query3, query4);
+  expect(query1.length).toBe(2); // two fixtures have rating > 8.5 (8.6 and 9.9)
+  expect(query2.length).toBe(1); // exactly one fixture has director "Greta Gerwig"
+  expect(query3.length).toBe(1); // only the 9.9-rated "science fiction" fixture meets both conditions
+  expect(query4.length).toBe(0); // the nonsense query is expected to return no documents
+});