Feature: Index Ontologies metadata and content & Agents (#130)
* use standard Solr in Docker Compose without the old OntoPortal configs

* migrate the ontology properties Solr configuration to use the Schema API

* migrate the ontology classes Solr configuration to use the Schema API

* migrate provisional class indexing to use the Schema API and model hooks

* update tests to handle the new indexing API

* simplify the ontology properties index schema

* update the class and property schemas to use the existing dynamic field names

* index submission and ontology metadata on save

* index agent metadata

* add ontology and agent metadata indexing tests

* make agent name, acronym, email and identifiers searchable

* unindex the ontology submission when it is archived

* make ontology acronym and name searchable

* update the embedded ontology document to include all fields, and update the submission on save

* fix embedded-document search tests

* rename the ontology unindex method to unindex_all_data to prevent naming conflicts

* implement indexing of all ontology content

* fix unescaping of indexed property names

* fix an issue with frozen request params after updating the RDF gem to 3.0

* add parallel processing to the index_all_data step

* clear indexed data when an ontology is deleted

* optimize index_all_data on Virtuoso and GraphDB by pre-fetching all ids (see the sketch after the timings below)

- Before optimization
    - fs ⇒ 15.22s
    - ag ⇒ 19.24s
    - vo ⇒ 42.95s
    - gb ⇒ 33.53s
- After optimization
    - fs ⇒ 15.37s
    - ag ⇒ 17.37s
    - vo ⇒ 16.56s
    - gb ⇒ 15.43s
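The pre-fetching works in two phases: a cheap paged SELECT DISTINCT collects and sorts all resource ids first, and the heavier triple fetches then target only those known ids through a FILTER clause, avoiding the slow high-OFFSET scans on Virtuoso and GraphDB. A minimal sketch of the pattern, distilled from the concern added below; Goo.sparql_query_client and the query-builder calls come from the diff itself, while the graph URI, page size, and slice size here are illustrative assumptions:

require 'rdf'

# Phase 1: collect distinct subject ids with a cheap paged query, then sort
# them so batches stay stable across workers.
graph = RDF::URI.new('http://example.org/submissions/1')  # illustrative graph URI
ids = Goo.sparql_query_client.select(:id)
         .distinct
         .from(graph)
         .where(%i[id p v])
         .limit(1000)  # page size, illustrative
         .offset(0)
         .each_solution.map(&:id).sort

# Phase 2: fetch triples only for a bounded slice of known ids; filtering on
# explicit ids replaces the expensive OFFSET-based triple pagination.
filter = ids.first(100).map { |x| "?id = <#{x}>" }.join(' || ')
query = Goo.sparql_query_client.select(:id, :p, :v)
           .from(graph)
           .where(%i[id p v])
           .filter(filter)
query.each_solution { |sol| puts "#{sol[:id]} #{sol[:p]} #{sol[:v]}" }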
syphax-bouazzouni authored Mar 3, 2024
1 parent cab072e commit d37aeaf
Showing 12 changed files with 496 additions and 140 deletions.
2 changes: 2 additions & 0 deletions Gemfile
@@ -21,6 +21,7 @@ gem 'rubyzip', '~> 1.0'
 gem 'thin'
 gem 'request_store'
 gem 'jwt'
+gem "parallel", "~> 1.24"

 # Testing
 group :test do
@@ -39,3 +40,4 @@ end
 # NCBO gems (can be from a local dev path or from rubygems/git)
 gem 'goo', github: 'ontoportal-lirmm/goo', branch: 'development'
 gem 'sparql-client', github: 'ontoportal-lirmm/sparql-client', branch: 'master'
+
1 change: 1 addition & 0 deletions Gemfile.lock
@@ -246,6 +246,7 @@ DEPENDENCIES
   multi_json (~> 1.0)
   oj (~> 2.0)
   omni_logger
+  parallel (~> 1.24)
   pony
   pry
   rack (~> 1.0)
11 changes: 0 additions & 11 deletions lib/ontologies_linked_data/concerns/mappings/mapping_external.rb

This file was deleted.

@@ -0,0 +1,161 @@
require 'parallel'

module LinkedData
  module Concerns
    module OntologySubmission
      module IndexAllData

        module ClassMethods
          def clear_indexed_content(ontology)
            conn = Goo.init_search_connection(:ontology_data)
            begin
              conn.delete_by_query("ontology_t:\"#{ontology}\"")
            rescue StandardError => e
              puts e.message
            end
            conn
          end
        end

        def self.included(base)
          base.extend(ClassMethods)
        end

        def index_sorted_ids(ids, ontology, conn, logger, commit = true)
          total_triples = Parallel.map(ids.each_slice(100), in_threads: 10) do |ids_slice|
            index_ids = 0
            triples_count = 0
            documents = {}
            time = Benchmark.realtime do
              documents, triples_count = fetch_triples(ids_slice, ontology)
            end

            # A `return` here would raise from the worker thread and leave nil
            # entries in the result array; skip the empty slice instead.
            next 0 if documents.empty?

            logger.info("Worker #{Parallel.worker_number} > Fetched #{triples_count} triples of #{id} in #{time} sec.") if triples_count.positive?

            time = Benchmark.realtime do
              conn.index_document(documents.values, commit: false)
              conn.index_commit if commit
              index_ids = documents.size
            end
            logger.info("Worker #{Parallel.worker_number} > Indexed #{index_ids} ids of #{id} in #{time} sec.")
            triples_count
          end
          total_triples.sum
        end

        def index_all_data(logger, commit = true)
          page = 1
          size = 1000
          count_ids = 0
          total_time = 0
          total_triples = 0
          old_count = -1

          ontology = self.bring(:ontology).ontology
                         .bring(:acronym).acronym
          conn = init_search_collection(ontology)

          ids = []

          # Stop once a page adds no new ids (count_ids stops growing).
          while count_ids != old_count
            old_count = count_ids
            count = 0
            time = Benchmark.realtime do
              ids = fetch_sorted_ids(size, page)
              count = ids.size
            end

            count_ids += count
            total_time += time

            if count.positive?
              logger.info("Fetched #{count} ids of #{id} (page #{page}) in #{time} sec.")
              total_triples += index_sorted_ids(ids, ontology, conn, logger, commit)
            end
            page += 1
          end
          logger.info("Completed indexing all ontology data: #{self.id} in #{total_time} sec. (#{count_ids} ids / #{total_triples} triples)")
          logger.flush
        end

        private

        def fetch_sorted_ids(size, page)
          query = Goo.sparql_query_client.select(:id)
                     .distinct
                     .from(RDF::URI.new(self.id))
                     .where(%i[id p v])
                     .limit(size)
                     .offset((page - 1) * size)

          query.each_solution.map(&:id).sort
        end

        def update_doc(doc, property, new_val)
          # Solr field names escape property URIs ('://' -> '___', '/' -> '_');
          # reverse the escaping to locate the existing field.
          unescaped_prop = property.gsub('___', '://')
          unescaped_prop = unescaped_prop.gsub('_', '/')
          existent_val = doc["#{unescaped_prop}_t"] || doc["#{unescaped_prop}_txt"]

          if !existent_val && !property['#']
            unescaped_prop = unescaped_prop.sub(%r{/([^/]+)$}, '#\1') # replace the last '/' with '#'
            existent_val = doc["#{unescaped_prop}_t"] || doc["#{unescaped_prop}_txt"]
          end

          if (existent_val && new_val) || new_val.is_a?(Array)
            # Multi-valued property: move from the single-valued *_t field
            # to the multi-valued *_txt field.
            doc.delete("#{unescaped_prop}_t")
            doc["#{unescaped_prop}_txt"] = Array(existent_val) + Array(new_val).map(&:to_s)
          elsif existent_val.nil? && new_val
            doc["#{unescaped_prop}_t"] = new_val.to_s
          end
          doc
        end

        def init_search_collection(ontology)
          self.class.clear_indexed_content(ontology)
        end

        def fetch_triples(ids_slice, ontology)
          documents = {}
          count = 0
          filter = ids_slice.map { |x| "?id = <#{x}>" }.join(' || ')
          query = Goo.sparql_query_client.select(:id, :p, :v)
                     .from(RDF::URI.new(self.id))
                     .where(%i[id p v])
                     .filter(filter)
          query.each_solution do |sol|
            count += 1
            doc = documents[sol[:id].to_s]
            doc ||= {
              id: "#{sol[:id]}_#{ontology}", submission_id_t: self.id.to_s,
              ontology_t: ontology, resource_model: self.class.model_name,
              resource_id: sol[:id].to_s
            }
            property = sol[:p].to_s
            value = sol[:v]

            if property.eql?(RDF.type.to_s)
              update_doc(doc, 'type', value)
            else
              update_doc(doc, property, value)
            end
            documents[sol[:id].to_s] = doc
          end
          [documents, count]
        end

      end
    end
  end
end
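A short usage sketch for the concern above; the submission variable, logger, and acronym are illustrative assumptions, not code from this commit:

require 'logger'

# Assuming `sub` is an already-loaded LinkedData::Models::OntologySubmission.
logger = Logger.new($stdout)
sub.index_all_data(logger)  # pages sorted ids, indexes 100-id slices across 10 threads, commits

# Clearing an ontology's indexed content by acronym ('AGROVOC' is illustrative):
LinkedData::Models::OntologySubmission.clear_indexed_content('AGROVOC')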

13 changes: 9 additions & 4 deletions lib/ontologies_linked_data/models/agents/agent.rb
@@ -7,13 +7,13 @@ class Agent < LinkedData::Models::Base

       model :Agent, namespace: :foaf, name_with: lambda { |cc| uuid_uri_generator(cc) }
       attribute :agentType, enforce: [:existence], enforcedValues: %w[person organization]
-      attribute :name, namespace: :foaf, enforce: %i[existence]
+      attribute :name, namespace: :foaf, enforce: %i[existence], fuzzy_search: true

       attribute :homepage, namespace: :foaf
-      attribute :acronym, namespace: :skos, property: :altLabel
-      attribute :email, namespace: :foaf, property: :mbox, enforce: %i[email unique]
+      attribute :acronym, namespace: :skos, property: :altLabel, fuzzy_search: true
+      attribute :email, namespace: :foaf, property: :mbox, enforce: %i[email unique], fuzzy_search: true

-      attribute :identifiers, namespace: :adms, property: :identifier, enforce: %i[Identifier list unique_identifiers]
+      attribute :identifiers, namespace: :adms, property: :identifier, enforce: %i[Identifier list unique_identifiers], fuzzy_search: true
       attribute :affiliations, enforce: %i[Agent list is_organization], namespace: :org, property: :memberOf
       attribute :creator, type: :user, enforce: [:existence]
       embed :identifiers, :affiliations
@@ -23,6 +23,11 @@ class Agent < LinkedData::Models::Base
       write_access :creator
       access_control_load :creator

+      enable_indexing(:agents_metadata)
+
+      def embedded_doc
+        "#{self.name} #{self.acronym} #{self.email} #{self.agentType}"
+      end
+
       def self.load_agents_usages(agents = [], agent_attributes = OntologySubmission.agents_attr_uris)
         q = Goo.sparql_query_client.select(:id, :property, :agent, :status).distinct.from(LinkedData::Models::OntologySubmission.uri_type).where([:id,LinkedData::Models::OntologySubmission.attribute_uri(:submissionStatus),:status], [:id, :property, :agent])
4 changes: 4 additions & 0 deletions lib/ontologies_linked_data/models/agents/identifier.rb
@@ -21,6 +21,10 @@ def self.generate_identifier(notation, schema_agency)
         return RDF::URI.new(Goo.id_prefix + 'Identifiers/' + out.join(':')) if out.size.eql?(2)
       end

+      def embedded_doc
+        "#{self.id.split('/').last}"
+      end
+
       def no_url(inst,attr)
         inst.bring(attr) if inst.bring?(attr)
         notation = inst.send(attr)
5 changes: 5 additions & 0 deletions lib/ontologies_linked_data/models/contact.rb
@@ -6,6 +6,11 @@ class Contact < LinkedData::Models::Base
       attribute :email, enforce: [:existence]

       embedded true
+
+      def embedded_doc
+        "#{self.name} #{self.email}"
+      end
+
     end
   end
 end
36 changes: 30 additions & 6 deletions lib/ontologies_linked_data/models/ontology.rb
@@ -26,8 +26,8 @@ class OntologyAnalyticsError < StandardError; end

       model :ontology, :name_with => :acronym
       attribute :acronym, namespace: :omv,
-                enforce: [:unique, :existence, lambda { |inst,attr| validate_acronym(inst,attr) } ]
-      attribute :name, :namespace => :omv, enforce: [:unique, :existence]
+                enforce: [:unique, :existence, lambda { |inst,attr| validate_acronym(inst,attr) } ], fuzzy_search: true
+      attribute :name, :namespace => :omv, enforce: [:unique, :existence], fuzzy_search: true
       attribute :submissions, inverse: { on: :ontology_submission, attribute: :ontology },
                 metadataMappings: ["dct:hasVersion", "pav:hasCurrentVersion", "pav:hasVersion", "prov:generalizationOf", "adms:next"]
       attribute :projects,
@@ -88,6 +88,10 @@ class OntologyAnalyticsError < StandardError; end
       # Cache
       cache_timeout 3600

+      enable_indexing(:ontology_metadata)
+
+      after_save :index_latest_submission
+
       def self.validate_acronym(inst, attr)
         inst.bring(attr) if inst.bring?(attr)
         acronym = inst.send(attr)
@@ -420,9 +424,8 @@ def delete(*args)
         end

         # remove index entries
-        unindex(index_commit)
-        unindex_properties(index_commit)
+        unindex_all_data(index_commit)

         # delete all files
         ontology_dir = File.join(LinkedData.settings.repository_folder, self.acronym.to_s)
         FileUtils.rm_rf(ontology_dir)
@@ -443,8 +446,28 @@ def save(*args)
         self
       end

-      def unindex(commit=true)
+      def index_latest_submission
+        last_s = latest_submission(status: :any)
+        return if last_s.nil?
+
+        last_s.ontology = self
+        last_s.index_update([:ontology])
+      end
+
+      def unindex_all_data(commit=true)
         unindex_by_acronym(commit)
+        unindex_properties(commit)
+      end
+
+      def embedded_doc
+        self.administeredBy.map{|x| x.bring_remaining}
+        doc = indexable_object
+        doc.delete(:id)
+        doc.delete(:resource_id)
+        doc.delete('ontology_viewOf_resource_model_t')
+        doc['ontology_viewOf_t'] = self.viewOf.id.to_s unless self.viewOf.nil?
+        doc[:resource_model_t] = doc.delete(:resource_model)
+        doc
       end

       def unindex_properties(commit=true)
@@ -459,6 +482,7 @@ def unindex_by_acronym(commit=true)
         query = "submissionAcronym:#{acronym}"
         Class.unindexByQuery(query)
         Class.indexCommit(nil) if commit
+        OntologySubmission.clear_indexed_content(acronym)
       end

       def restricted?
