Feature: Index Ontologies metadata and content & Agents (#130)
* use standard Solr in Docker Compose without the old OntoPortal configs

* migrate the ontology properties Solr configuration to use the Schema API

* migrate the ontology classes Solr configuration to use the Schema API

* migrate provisional class indexing to use the Schema API and model hooks

* update tests to handle the new indexing API

* simplify the ontology properties index schema

* update the class and property schemas to use the existing dynamic field names

* index submission and ontology metadata on save

* index agent metadata

* add ontology and agent metadata indexing tests

* make agent name, acronym, email and identifiers searchable

* unindex the ontology submission when it is archived

* make ontology acronym and name searchable

* update the embedded ontology document to include all fields, and update the submission on save

* fix embedded-document search tests

* rename the ontology unindex method to unindex_all_data to prevent naming conflicts

* implement indexing of all ontology content

* fix unescaping of indexed property names

* fix an issue with frozen request params after updating the RDF gem to 3.0

* add parallel processing to the index_all_data step

* clear indexed data when an ontology is deleted

* optimize index_all_data on Virtuoso and GraphDB by pre-fetching all ids (see the sketch after the timings below)

- Before optimization
    - fs ⇒ 15.22s
    - ag ⇒ 19.24s
    - vo ⇒ 42.95s
    - gb ⇒ 33.53s
- After optimization
    - fs ⇒ 15.37s
    - ag ⇒ 17.37s
    - vo ⇒ 16.56s
    - gb ⇒ 15.43s
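The pre-fetching works in two phases: a cheap paged SELECT DISTINCT collects and sorts all resource ids first, and the heavier triple fetches then target only those known ids through a FILTER clause, avoiding the slow high-OFFSET scans on Virtuoso and GraphDB. A minimal sketch of the pattern, distilled from the concern added below; Goo.sparql_query_client and the query-builder calls come from the diff itself, while the graph URI, page size, and slice size here are illustrative assumptions:

require 'rdf'

# Phase 1: collect distinct subject ids with a cheap paged query, then sort
# them so batches stay stable across workers.
graph = RDF::URI.new('http://example.org/submissions/1')  # illustrative graph URI
ids = Goo.sparql_query_client.select(:id)
         .distinct
         .from(graph)
         .where(%i[id p v])
         .limit(1000)  # page size, illustrative
         .offset(0)
         .each_solution.map(&:id).sort

# Phase 2: fetch triples only for a bounded slice of known ids; filtering on
# explicit ids replaces the expensive OFFSET-based triple pagination.
filter = ids.first(100).map { |x| "?id = <#{x}>" }.join(' || ')
query = Goo.sparql_query_client.select(:id, :p, :v)
           .from(graph)
           .where(%i[id p v])
           .filter(filter)
query.each_solution { |sol| puts "#{sol[:id]} #{sol[:p]} #{sol[:v]}" }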
syphax-bouazzouni authored Mar 3, 2024
1 parent cab072e commit d37aeaf
Showing 12 changed files with 496 additions and 140 deletions.
2 changes: 2 additions & 0 deletions Gemfile
@@ -21,6 +21,7 @@ gem 'rubyzip', '~> 1.0'
 gem 'thin'
 gem 'request_store'
 gem 'jwt'
+gem "parallel", "~> 1.24"

 # Testing
 group :test do
@@ -39,3 +40,4 @@ end
 # NCBO gems (can be from a local dev path or from rubygems/git)
 gem 'goo', github: 'ontoportal-lirmm/goo', branch: 'development'
 gem 'sparql-client', github: 'ontoportal-lirmm/sparql-client', branch: 'master'
+
1 change: 1 addition & 0 deletions Gemfile.lock
@@ -246,6 +246,7 @@ DEPENDENCIES
   multi_json (~> 1.0)
   oj (~> 2.0)
   omni_logger
+  parallel (~> 1.24)
   pony
   pry
   rack (~> 1.0)
11 changes: 0 additions & 11 deletions lib/ontologies_linked_data/concerns/mappings/mapping_external.rb

This file was deleted.

@@ -0,0 +1,161 @@
require 'parallel'

module LinkedData
  module Concerns
    module OntologySubmission
      module IndexAllData

        module ClassMethods
          def clear_indexed_content(ontology)
            conn = Goo.init_search_connection(:ontology_data)
            begin
              conn.delete_by_query("ontology_t:\"#{ontology}\"")
            rescue StandardError => e
              puts e.message
            end
            conn
          end
        end

        def self.included(base)
          base.extend(ClassMethods)
        end

        def index_sorted_ids(ids, ontology, conn, logger, commit = true)
          total_triples = Parallel.map(ids.each_slice(100), in_threads: 10) do |ids_slice|
            index_ids = 0
            triples_count = 0
            documents = {}
            time = Benchmark.realtime do
              documents, triples_count = fetch_triples(ids_slice, ontology)
            end

            # A `return` here would raise from the worker thread and leave nil
            # entries in the result array; skip the empty slice instead.
            next 0 if documents.empty?

            logger.info("Worker #{Parallel.worker_number} > Fetched #{triples_count} triples of #{id} in #{time} sec.") if triples_count.positive?

            time = Benchmark.realtime do
              conn.index_document(documents.values, commit: false)
              conn.index_commit if commit
              index_ids = documents.size
            end
            logger.info("Worker #{Parallel.worker_number} > Indexed #{index_ids} ids of #{id} in #{time} sec.")
            triples_count
          end
          total_triples.sum
        end

        def index_all_data(logger, commit = true)
          page = 1
          size = 1000
          count_ids = 0
          total_time = 0
          total_triples = 0
          old_count = -1

          ontology = self.bring(:ontology).ontology
                         .bring(:acronym).acronym
          conn = init_search_collection(ontology)

          ids = []

          # Stop once a page adds no new ids (count_ids stops growing).
          while count_ids != old_count
            old_count = count_ids
            count = 0
            time = Benchmark.realtime do
              ids = fetch_sorted_ids(size, page)
              count = ids.size
            end

            count_ids += count
            total_time += time

            if count.positive?
              logger.info("Fetched #{count} ids of #{id} (page #{page}) in #{time} sec.")
              total_triples += index_sorted_ids(ids, ontology, conn, logger, commit)
            end
            page += 1
          end
          logger.info("Completed indexing all ontology data: #{self.id} in #{total_time} sec. (#{count_ids} ids / #{total_triples} triples)")
          logger.flush
        end

        private

        def fetch_sorted_ids(size, page)
          query = Goo.sparql_query_client.select(:id)
                     .distinct
                     .from(RDF::URI.new(self.id))
                     .where(%i[id p v])
                     .limit(size)
                     .offset((page - 1) * size)

          query.each_solution.map(&:id).sort
        end

        def update_doc(doc, property, new_val)
          # Solr field names escape property URIs ('://' -> '___', '/' -> '_');
          # reverse the escaping to locate the existing field.
          unescaped_prop = property.gsub('___', '://')
          unescaped_prop = unescaped_prop.gsub('_', '/')
          existent_val = doc["#{unescaped_prop}_t"] || doc["#{unescaped_prop}_txt"]

          if !existent_val && !property['#']
            unescaped_prop = unescaped_prop.sub(%r{/([^/]+)$}, '#\1') # replace the last '/' with '#'
            existent_val = doc["#{unescaped_prop}_t"] || doc["#{unescaped_prop}_txt"]
          end

          if (existent_val && new_val) || new_val.is_a?(Array)
            # Multi-valued property: move from the single-valued *_t field
            # to the multi-valued *_txt field.
            doc.delete("#{unescaped_prop}_t")
            doc["#{unescaped_prop}_txt"] = Array(existent_val) + Array(new_val).map(&:to_s)
          elsif existent_val.nil? && new_val
            doc["#{unescaped_prop}_t"] = new_val.to_s
          end
          doc
        end

        def init_search_collection(ontology)
          self.class.clear_indexed_content(ontology)
        end

        def fetch_triples(ids_slice, ontology)
          documents = {}
          count = 0
          filter = ids_slice.map { |x| "?id = <#{x}>" }.join(' || ')
          query = Goo.sparql_query_client.select(:id, :p, :v)
                     .from(RDF::URI.new(self.id))
                     .where(%i[id p v])
                     .filter(filter)
          query.each_solution do |sol|
            count += 1
            doc = documents[sol[:id].to_s]
            doc ||= {
              id: "#{sol[:id]}_#{ontology}", submission_id_t: self.id.to_s,
              ontology_t: ontology, resource_model: self.class.model_name,
              resource_id: sol[:id].to_s
            }
            property = sol[:p].to_s
            value = sol[:v]

            if property.eql?(RDF.type.to_s)
              update_doc(doc, 'type', value)
            else
              update_doc(doc, property, value)
            end
            documents[sol[:id].to_s] = doc
          end
          [documents, count]
        end

      end
    end
  end
end
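A short usage sketch for the concern above; the submission variable, logger, and acronym are illustrative assumptions, not code from this commit:

require 'logger'

# Assuming `sub` is an already-loaded LinkedData::Models::OntologySubmission.
logger = Logger.new($stdout)
sub.index_all_data(logger)  # pages sorted ids, indexes 100-id slices across 10 threads, commits

# Clearing an ontology's indexed content by acronym ('AGROVOC' is illustrative):
LinkedData::Models::OntologySubmission.clear_indexed_content('AGROVOC')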

13 changes: 9 additions & 4 deletions lib/ontologies_linked_data/models/agents/agent.rb
@@ -7,13 +7,13 @@ class Agent < LinkedData::Models::Base

       model :Agent, namespace: :foaf, name_with: lambda { |cc| uuid_uri_generator(cc) }
       attribute :agentType, enforce: [:existence], enforcedValues: %w[person organization]
-      attribute :name, namespace: :foaf, enforce: %i[existence]
+      attribute :name, namespace: :foaf, enforce: %i[existence], fuzzy_search: true

       attribute :homepage, namespace: :foaf
-      attribute :acronym, namespace: :skos, property: :altLabel
-      attribute :email, namespace: :foaf, property: :mbox, enforce: %i[email unique]
+      attribute :acronym, namespace: :skos, property: :altLabel, fuzzy_search: true
+      attribute :email, namespace: :foaf, property: :mbox, enforce: %i[email unique], fuzzy_search: true

-      attribute :identifiers, namespace: :adms, property: :identifier, enforce: %i[Identifier list unique_identifiers]
+      attribute :identifiers, namespace: :adms, property: :identifier, enforce: %i[Identifier list unique_identifiers], fuzzy_search: true
       attribute :affiliations, enforce: %i[Agent list is_organization], namespace: :org, property: :memberOf
       attribute :creator, type: :user, enforce: [:existence]
       embed :identifiers, :affiliations
@@ -23,6 +23,11 @@ class Agent < LinkedData::Models::Base
       write_access :creator
       access_control_load :creator

+      enable_indexing(:agents_metadata)
+
+      def embedded_doc
+        "#{self.name} #{self.acronym} #{self.email} #{self.agentType}"
+      end
+
       def self.load_agents_usages(agents = [], agent_attributes = OntologySubmission.agents_attr_uris)
         q = Goo.sparql_query_client.select(:id, :property, :agent, :status).distinct.from(LinkedData::Models::OntologySubmission.uri_type).where([:id,LinkedData::Models::OntologySubmission.attribute_uri(:submissionStatus),:status], [:id, :property, :agent])
4 changes: 4 additions & 0 deletions lib/ontologies_linked_data/models/agents/identifier.rb
@@ -21,6 +21,10 @@ def self.generate_identifier(notation, schema_agency)
         return RDF::URI.new(Goo.id_prefix + 'Identifiers/' + out.join(':')) if out.size.eql?(2)
       end

+      def embedded_doc
+        "#{self.id.split('/').last}"
+      end
+
       def no_url(inst,attr)
         inst.bring(attr) if inst.bring?(attr)
         notation = inst.send(attr)
5 changes: 5 additions & 0 deletions lib/ontologies_linked_data/models/contact.rb
@@ -6,6 +6,11 @@ class Contact < LinkedData::Models::Base
       attribute :email, enforce: [:existence]

       embedded true
+
+      def embedded_doc
+        "#{self.name} #{self.email}"
+      end
+
     end
   end
 end
36 changes: 30 additions & 6 deletions lib/ontologies_linked_data/models/ontology.rb
@@ -26,8 +26,8 @@ class OntologyAnalyticsError < StandardError; end

       model :ontology, :name_with => :acronym
       attribute :acronym, namespace: :omv,
-                enforce: [:unique, :existence, lambda { |inst,attr| validate_acronym(inst,attr) } ]
-      attribute :name, :namespace => :omv, enforce: [:unique, :existence]
+                enforce: [:unique, :existence, lambda { |inst,attr| validate_acronym(inst,attr) } ], fuzzy_search: true
+      attribute :name, :namespace => :omv, enforce: [:unique, :existence], fuzzy_search: true
       attribute :submissions, inverse: { on: :ontology_submission, attribute: :ontology },
                 metadataMappings: ["dct:hasVersion", "pav:hasCurrentVersion", "pav:hasVersion", "prov:generalizationOf", "adms:next"]
       attribute :projects,
@@ -88,6 +88,10 @@ class OntologyAnalyticsError < StandardError; end
       # Cache
       cache_timeout 3600

+      enable_indexing(:ontology_metadata)
+
+      after_save :index_latest_submission
+
       def self.validate_acronym(inst, attr)
         inst.bring(attr) if inst.bring?(attr)
         acronym = inst.send(attr)
@@ -420,9 +424,8 @@ def delete(*args)
         end

         # remove index entries
-        unindex(index_commit)
-        unindex_properties(index_commit)
+        unindex_all_data(index_commit)

         # delete all files
         ontology_dir = File.join(LinkedData.settings.repository_folder, self.acronym.to_s)
         FileUtils.rm_rf(ontology_dir)
@@ -443,8 +446,28 @@ def save(*args)
         self
       end

-      def unindex(commit=true)
+      def index_latest_submission
+        last_s = latest_submission(status: :any)
+        return if last_s.nil?
+
+        last_s.ontology = self
+        last_s.index_update([:ontology])
+      end
+
+      def unindex_all_data(commit=true)
         unindex_by_acronym(commit)
+        unindex_properties(commit)
+      end
+
+      def embedded_doc
+        self.administeredBy.map{|x| x.bring_remaining}
+        doc = indexable_object
+        doc.delete(:id)
+        doc.delete(:resource_id)
+        doc.delete('ontology_viewOf_resource_model_t')
+        doc['ontology_viewOf_t'] = self.viewOf.id.to_s unless self.viewOf.nil?
+        doc[:resource_model_t] = doc.delete(:resource_model)
+        doc
       end

       def unindex_properties(commit=true)
@@ -459,6 +482,7 @@ def unindex_by_acronym(commit=true)
         query = "submissionAcronym:#{acronym}"
         Class.unindexByQuery(query)
         Class.indexCommit(nil) if commit
+        OntologySubmission.clear_indexed_content(acronym)
       end

       def restricted?
