patterns-ai-core · andreibondarev · Jun 24, 2023 · Jun 22, 2023 · Jun 23, 2023 · Jun 23, 2023
diff --git a/README.md b/README.md
@@ -41,7 +41,7 @@ require "langchain"
 | [Pinecone](https://www.pinecone.io/) | :white_check_mark: | :white_check_mark: | :white_check_mark: | WIP     | WIP               |
 | [Pgvector](https://github.com/pgvector/pgvector) | :white_check_mark: | :white_check_mark: | :white_check_mark: | WIP     | WIP               |
 | [Qdrant](https://qdrant.tech/) | :white_check_mark: | :white_check_mark: | :white_check_mark: | WIP     | WIP               |
-| [Weaviate](https://weaviate.io/) | :white_check_mark: | :white_check_mark: | :white_check_mark: | WIP     | WIP               |
+| [Weaviate](https://weaviate.io/) | :white_check_mark: | :white_check_mark: | :white_check_mark: | WIP     | :white_check_mark: |
 
 ### Using Vector Search Databases 🔍
 

diff --git a/lib/langchain.rb b/lib/langchain.rb
@@ -145,6 +145,10 @@ module Prompt
     autoload :FewShotPromptTemplate, "langchain/prompt/few_shot_prompt_template"
   end
 
+  module ActiveRecord # NOTE: inside Langchain this name shadows the top-level ::ActiveRecord constant
+    autoload :Hooks, "langchain/active_record/hooks" # loaded lazily on first reference to Hooks
+  end
+
   module OutputParsers
     autoload :Base, "langchain/output_parsers/base"
     autoload :StructuredOutputParser, "langchain/output_parsers/structured"
@@ -154,3 +158,5 @@ module Errors
     class BaseError < StandardError; end
   end
 end
+
+require "langchain/railtie" if defined?(Rails) # Railtie subclasses Rails::Railtie, so load it only when Rails is present
diff --git a/lib/langchain/active_record/hooks.rb b/lib/langchain/active_record/hooks.rb
@@ -0,0 +1,96 @@
+# frozen_string_literal: true
+
+module Langchain
+  module ActiveRecord
+    # This module adds the following functionality to your ActiveRecord models:
+    # * `vectorsearch` class method to set the vector search provider
+    # * `similarity_search` class method to search for similar texts
+    # * `upsert_to_vectorsearch` instance method to upsert the record to the vector search provider
+    #
+    # Usage:
+    #     class Recipe < ActiveRecord::Base
+    #       vectorsearch provider: Langchain::Vectorsearch::Weaviate.new(
+    #                    api_key: ENV["WEAVIATE_API_KEY"],
+    #                    url: ENV["WEAVIATE_URL"],
+    #                    index_name: "Recipes",
+    #                    llm: Langchain::LLM::OpenAI.new(api_key: ENV["OPENAI_API_KEY"])
+    #                 )
+    #
+    #       after_save :upsert_to_vectorsearch
+    #
+    #       # Overwriting how the model is serialized before it's indexed
+    #       def as_vector
+    #         [
+    #           "Title: #{title}",
+    #           "Description: #{description}",
+    #           ...
+    #         ]
+    #         .compact
+    #         .join("\n")
+    #       end
+    #     end
+    #
+    # Create the default schema
+    #     Recipe.class_variable_get(:@@provider).create_default_schema
+    # Query the vector search provider
+    #     Recipe.similarity_search("carnivore dish")
+    # Delete the default schema to start over
+    #     Recipe.class_variable_get(:@@provider).client.schema.delete class_name: "Recipes"
+    #
+    module Hooks
+      # Ruby inclusion hook: extends the including class with ClassMethods
+      # @param base [Module] The class or module including Hooks
+      def self.included(base)
+        base.extend(ClassMethods)
+      end
+
+      # Index this record's text in the vector search provider
+      # Typically wired up as an ActiveRecord callback (e.g. `after_save`)
+      #
+      # Records for which `previously_new_record?` is true are added; all others are updated
+      #
+      # @return [Object] whatever the provider's `add_texts` / `update_texts` returns
+      # NOTE(review): nothing is rescued here, so provider errors propagate to the caller
+      def upsert_to_vectorsearch
+        newly_created = previously_new_record?
+        provider = self.class.class_variable_get(:@@provider)
+        payload = {texts: [as_vector], ids: [id]}
+        return provider.add_texts(**payload) if newly_created
+
+        provider.update_texts(**payload)
+      end
+
+      # Text representation of the record that gets indexed
+      # Defaults to the record's JSON; override it in your model to customize
+      #
+      # @return [String] the text representation of the model
+      def as_vector
+        to_json
+      end
+
+      module ClassMethods
+        # Register the vector search provider for this model
+        # The provider is stored in the `@@provider` class variable
+        #
+        # @param provider [Object] The `Langchain::Vectorsearch::*` instance
+        # @return [Object] The provider that was passed in
+        def vectorsearch(provider:)
+          class_variable_set(:@@provider, provider)
+        end
+
+        # Query the provider and load the records its hits point back to
+        # Each hit's "__id" value is matched against the model's `id` column
+        # NOTE(review): built with `where(id: ...)`; presumably the provider's ranking order is lost — confirm
+        #
+        # @param query [String] The query to search for
+        # @param k [Integer] The number of results to return
+        # @return [ActiveRecord::Relation] The ActiveRecord relation
+        def similarity_search(query, k: 1)
+          provider = class_variable_get(:@@provider)
+          hits = provider.similarity_search(query: query, k: k)
+          where(id: hits.map { |hit| hit.dig("__id") })
+        end
+      end
+    end
+  end
+end
diff --git a/lib/langchain/railtie.rb b/lib/langchain/railtie.rb
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+module Langchain
+  class Railtie < Rails::Railtie # needs Rails; lib/langchain.rb requires this file only if defined?(Rails)
+    initializer "langchain" do
+      ActiveSupport.on_load(:active_record) do # presumably runs once ActiveRecord::Base has loaded — see Rails lazy load hooks
+        ::ActiveRecord::Base.include Langchain::ActiveRecord::Hooks # leading :: avoids resolving to Langchain::ActiveRecord
+      end
+    end
+  end
+end
diff --git a/lib/langchain/vectorsearch/weaviate.rb b/lib/langchain/vectorsearch/weaviate.rb
@@ -14,7 +14,7 @@ class Weaviate < Base
     # Initialize the Weaviate adapter
     # @param url [String] The URL of the Weaviate instance
     # @param api_key [String] The API key to use
-    # @param index_name [String] The name of the index to use
+    # @param index_name [String] The capitalized name of the index to use
     # @param llm [Object] The LLM client to use
     def initialize(url:, api_key:, index_name:, llm:)
       depends_on "weaviate-ruby"
@@ -24,6 +24,9 @@ def initialize(url:, api_key:, index_name:, llm:)
         url: url,
         api_key: api_key
       )
+
+      # Weaviate requires the class name to be Capitalized: https://weaviate.io/developers/weaviate/configuration/schema-configuration#create-a-class
+      # TODO: Capitalize index_name
       @index_name = index_name
 
       super(llm: llm)
@@ -32,31 +35,51 @@ def initialize(url:, api_key:, index_name:, llm:)
     # Add a list of texts to the index
     # @param texts [Array] The list of texts to add
     # @return [Hash] The response from the server
-    def add_texts(texts:)
-      objects = Array(texts).map do |text|
-        {
-          class: index_name,
-          properties: {content: text},
-          vector: llm.embed(text: text)
-        }
-      end
-
+    def add_texts(texts:, ids:) # ids[i] is stored as the "__id" property of texts[i]
       client.objects.batch_create(
-        objects: objects
+        objects: weaviate_objects(texts, ids)
       )
     end
 
+    # Update a list of texts in the index
+    # @param texts [Array] The list of texts to update
+    # @param ids [Array] The "__id" values of the objects to update; ids[i] belongs to texts[i]
+    # @return [Array] The responses from client.objects.update, one per text
+    # @raise [ArgumentError] If no object with a given "__id" is found in the index
+    def update_texts(texts:, ids:)
+      texts = Array(texts) # a single text is accepted, consistently in both passes
+
+      # Pass 1: resolve the Weaviate UUID of every object before anything is modified
+      uuids = texts.each_index.map do |i|
+        # NOTE(review): ids[i] is interpolated unescaped into the filter; an id containing quotes would break it
+        record = client.query.get(
+          class_name: index_name,
+          fields: "_additional { id }",
+          where: "{ path: [\"__id\"], operator: Equal, valueString: \"#{ids[i]}\" }"
+        )
+        record[0]&.dig("_additional", "id") || raise(ArgumentError, "No #{index_name} object found with __id #{ids[i]}")
+      end
+
+      # Pass 2: overwrite the properties and re-embed each text
+      texts.map.with_index do |text, i|
+        client.objects.update(
+          class_name: index_name,
+          id: uuids[i],
+          properties: {__id: ids[i].to_s, content: text},
+          vector: llm.embed(text: text)
+        )
+      end
+    end
+
     # Create default schema
     def create_default_schema
       client.schema.create(
         class_name: index_name,
         vectorizer: "none",
         properties: [
-          # TODO: Allow passing in your own IDs
-          {
-            dataType: ["text"],
-            name: "content"
-          }
+          # __id is used as a pointer to the original document
+          {dataType: ["string"], name: "__id"}, # '_id' is a reserved property name (single underscore)
+          {dataType: ["text"], name: "content"}
         ]
       )
     end
@@ -82,7 +105,7 @@ def similarity_search_by_vector(embedding:, k: 4)
         class_name: index_name,
         near_vector: near_vector,
         limit: k.to_s,
-        fields: "content _additional { id }"
+        fields: "__id content _additional { id }"
       )
     end
 
@@ -101,5 +124,24 @@ def ask(question:)
 
       llm.chat(prompt: prompt)
     end
+
+    private
+
+    # Build one batch payload entry per text; ids[position] is paired with the text at that position
+    # @return [Array<Hash>] One hash per text, each built by weaviate_object
+    def weaviate_objects(texts, ids)
+      Array(texts).each_with_index.map { |text, position| weaviate_object(text, ids[position]) }
+    end
+
+    # Build the payload hash for a single Weaviate object
+    # @param text The text stored as the "content" property and passed to llm.embed
+    # @param id The caller's identifier; stored, stringified, as the "__id" property
+    # @return [Hash] With :class, :properties and :vector keys
+    def weaviate_object(text, id)
+      target_class = index_name
+      stored_properties = {__id: id.to_s, content: text}
+      embedding = llm.embed(text: text)
+      {class: target_class, properties: stored_properties, vector: embedding}
+    end
   end
 end
diff --git a/spec/fixtures/vectorsearch/weaviate_create_default_schema.json b/spec/fixtures/vectorsearch/weaviate_create_default_schema.json
@@ -12,23 +12,14 @@
       "removals": null
     }
   },
-  "moduleConfig": {
-    "text2vec-openai": {
-      "model": "ada",
-      "modelVersion": "002",
-      "type": "text",
-      "vectorizeClassName": true
-    }
-  },
   "properties": [
+    {
+      "dataType": ["string"],
+      "name": "__id",
+      "tokenization": "word"
+    },
     {
       "dataType": ["text"],
-      "moduleConfig": {
-        "text2vec-openai": {
-          "skip": false,
-          "vectorizePropertyName": false
-        }
-      },
       "name": "content",
       "tokenization": "word"
     }
@@ -70,5 +61,5 @@
     }
   },
   "vectorIndexType": "hnsw",
-  "vectorizer": "text2vec-openai"
+  "vectorizer": "none"
 }
diff --git a/spec/langchain/active_record/hooks_spec.rb b/spec/langchain/active_record/hooks_spec.rb
@@ -0,0 +1,17 @@
+# frozen_string_literal: true
+
+class Dummy # minimal host class with no superclass; it only mixes in the hooks
+  include Langchain::ActiveRecord::Hooks
+end
+
+RSpec.describe Langchain::ActiveRecord::Hooks do
+  # Instances of the including class gain the indexing hooks
+  it "responds to instance methods" do
+    expect(Dummy.new).to respond_to(:upsert_to_vectorsearch, :as_vector)
+  end
+
+  # Including the module also extends the class with ClassMethods
+  it "responds to class methods" do
+    expect(Dummy).to respond_to(:vectorsearch, :similarity_search)
+  end
+end