diff --git a/Gemfile b/Gemfile
index 7e1f17c7..c8c821a3 100644
--- a/Gemfile
+++ b/Gemfile
@@ -21,6 +21,7 @@ gem 'rubyzip', '~> 1.0'
 gem 'thin'
 gem 'request_store'
 gem 'jwt'
+gem "parallel", "~> 1.24"
 
 # Testing
 group :test do
@@ -39,3 +40,4 @@ end
 # NCBO gems (can be from a local dev path or from rubygems/git)
 gem 'goo', github: 'ontoportal-lirmm/goo', branch: 'development'
 gem 'sparql-client', github: 'ontoportal-lirmm/sparql-client', branch: 'master'
+
diff --git a/Gemfile.lock b/Gemfile.lock
index 3035e28b..36386a5c 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -246,6 +246,7 @@ DEPENDENCIES
   multi_json (~> 1.0)
   oj (~> 2.0)
   omni_logger
+  parallel (~> 1.24)
   pony
   pry
   rack (~> 1.0)
diff --git a/lib/ontologies_linked_data/concerns/mappings/mapping_external.rb b/lib/ontologies_linked_data/concerns/mappings/mapping_external.rb
deleted file mode 100644
index 08717380..00000000
--- a/lib/ontologies_linked_data/concerns/mappings/mapping_external.rb
+++ /dev/null
@@ -1,11 +0,0 @@
-module LinkedData
-  module Concerns
-    module Mappings
-      module ExternalUtils
-
-
-      end
-    end
-  end
-end
-
diff --git a/lib/ontologies_linked_data/concerns/ontology_submissions/submission_index_all_data.rb b/lib/ontologies_linked_data/concerns/ontology_submissions/submission_index_all_data.rb
new file mode 100644
index 00000000..fc59de45
--- /dev/null
+++ b/lib/ontologies_linked_data/concerns/ontology_submissions/submission_index_all_data.rb
@@ -0,0 +1,161 @@
+require 'parallel'
+module LinkedData
+  module Concerns
+    module OntologySubmission
+      module IndexAllData
+
+        module ClassMethods
+          def clear_indexed_content(ontology)
+            conn = Goo.init_search_connection(:ontology_data)
+            begin
+              conn.delete_by_query("ontology_t:\"#{ontology}\"")
+            rescue StandardError => e
+              puts e.message
+            end
+            conn
+          end
+
+        end
+
+        def self.included(base)
+          base.extend(ClassMethods)
+        end
+
+        def index_sorted_ids(ids, ontology, conn, logger, commit = true)
+          total_triples = Parallel.map(ids.each_slice(100), in_threads: 10) do |ids_slice|
+            index_ids = 0
+            triples_count = 0
+            documents = {}
+            time = Benchmark.realtime do
+              documents, triples_count = fetch_triples(ids_slice, ontology)
+            end
+
+            return if documents.empty?
+
+            logger.info("Worker #{Parallel.worker_number} > Fetched #{triples_count} triples of #{id} in #{time} sec.") if triples_count.positive?
+
+            time = Benchmark.realtime do
+              conn.index_document(documents.values, commit: false)
+              conn.index_commit if commit
+              index_ids = documents.size
+              documents = {}
+            end
+            logger.info("Worker #{Parallel.worker_number} > Indexed #{index_ids} ids of #{id} in #{time} sec. Total #{documents.size} ids.")
+            triples_count
+          end
+          total_triples.sum
+        end
+
+        def index_all_data(logger, commit = true)
+          page = 1
+          size = 1000
+          count_ids = 0
+          total_time = 0
+          total_triples = 0
+          old_count = -1
+
+          ontology = self.bring(:ontology).ontology
+                         .bring(:acronym).acronym
+          conn = init_search_collection(ontology)
+
+          ids = {}
+
+          while count_ids != old_count
+            old_count = count_ids
+            count = 0
+            time = Benchmark.realtime do
+              ids = fetch_sorted_ids(size, page)
+              count = ids.size
+            end
+
+            count_ids += count
+            total_time += time
+            page += 1
+
+            next unless count.positive?
+
+            logger.info("Fetched #{count} ids of #{id} page: #{page} in #{time} sec.")
+
+            total_triples += index_sorted_ids(ids, ontology, conn, logger, commit)
+
+          end
+          logger.info("Completed indexing all ontology data: #{self.id} in #{total_time} sec. (#{count_ids} ids / #{total_triples} triples)")
+          logger.flush
+        end
+
+        private
+
+        def fetch_sorted_ids(size, page)
+          query = Goo.sparql_query_client.select(:id)
+                     .distinct
+                     .from(RDF::URI.new(self.id))
+                     .where(%i[id p v])
+                     .limit(size)
+                     .offset((page - 1) * size)
+
+          query.each_solution.map(&:id).sort
+        end
+
+        def update_doc(doc, property, new_val)
+          unescaped_prop = property.gsub('___', '://')
+
+          unescaped_prop = unescaped_prop.gsub('_', '/')
+          existent_val = doc["#{unescaped_prop}_t"] || doc["#{unescaped_prop}_txt"]
+
+          if !existent_val && !property['#']
+            unescaped_prop = unescaped_prop.sub(%r{/([^/]+)$}, '#\1') # change latest '/' with '#'
+            existent_val = doc["#{unescaped_prop}_t"] || doc["#{unescaped_prop}_txt"]
+          end
+
+          if existent_val && new_val || new_val.is_a?(Array)
+            doc.delete("#{unescaped_prop}_t")
+            doc["#{unescaped_prop}_txt"] = Array(existent_val) + Array(new_val).map(&:to_s)
+          elsif existent_val.nil? && new_val
+            doc["#{unescaped_prop}_t"] = new_val.to_s
+          end
+          doc
+        end
+
+        def init_search_collection(ontology)
+          self.class.clear_indexed_content(ontology)
+        end
+
+        def fetch_triples(ids_slice, ontology)
+          documents = {}
+          count = 0
+          filter = ids_slice.map { |x| "?id = <#{x}>" }.join(' || ')
+          query = Goo.sparql_query_client.select(:id, :p, :v)
+                     .from(RDF::URI.new(self.id))
+                     .where(%i[id p v])
+                     .filter(filter)
+          query.each_solution do |sol|
+            count += 1
+            doc = documents[sol[:id].to_s]
+            doc ||= {
+              id: "#{sol[:id]}_#{ontology}", submission_id_t: self.id.to_s,
+              ontology_t: ontology, resource_model: self.class.model_name,
+              resource_id: sol[:id].to_s
+            }
+            property = sol[:p].to_s
+            value = sol[:v]
+
+            if property.to_s.eql?(RDF.type.to_s)
+              update_doc(doc, 'type', value)
+            else
+              update_doc(doc, property, value)
+            end
+            documents[sol[:id].to_s] = doc
+          end
+          [documents, count]
+        end
+
+      end
+    end
+  end
+end
+
+
+
+
+
+
diff --git a/lib/ontologies_linked_data/models/agents/agent.rb b/lib/ontologies_linked_data/models/agents/agent.rb
index c31921a2..24601748 100644
--- a/lib/ontologies_linked_data/models/agents/agent.rb
+++ b/lib/ontologies_linked_data/models/agents/agent.rb
@@ -7,13 +7,13 @@ class Agent < LinkedData::Models::Base
     model :Agent, namespace: :foaf, name_with: lambda { |cc| uuid_uri_generator(cc) }
 
     attribute :agentType, enforce: [:existence], enforcedValues: %w[person organization]
-    attribute :name, namespace: :foaf, enforce: %i[existence]
+    attribute :name, namespace: :foaf, enforce: %i[existence], fuzzy_search: true
 
     attribute :homepage, namespace: :foaf
-    attribute :acronym, namespace: :skos, property: :altLabel
-    attribute :email, namespace: :foaf, property: :mbox, enforce: %i[email unique]
+    attribute :acronym, namespace: :skos, property: :altLabel, fuzzy_search: true
+    attribute :email, namespace: :foaf, property: :mbox, enforce: %i[email unique], fuzzy_search: true
 
-    attribute :identifiers, namespace: :adms, property: :identifier, enforce: %i[Identifier list unique_identifiers]
+    attribute :identifiers, namespace: :adms, property: :identifier, enforce: %i[Identifier list unique_identifiers], fuzzy_search: true
     attribute :affiliations, enforce: %i[Agent list is_organization], namespace: :org, property: :memberOf
     attribute :creator, type: :user, enforce: [:existence]
     embed :identifiers, :affiliations
@@ -23,6 +23,11 @@ class Agent < LinkedData::Models::Base
     write_access :creator
     access_control_load :creator
 
+    enable_indexing(:agents_metadata)
+
+    def embedded_doc
+      "#{self.name} #{self.acronym} #{self.email} #{self.agentType}"
+    end
     def self.load_agents_usages(agents = [], agent_attributes = OntologySubmission.agents_attr_uris)
       q = Goo.sparql_query_client.select(:id, :property, :agent, :status).distinct.from(LinkedData::Models::OntologySubmission.uri_type).where([:id,LinkedData::Models::OntologySubmission.attribute_uri(:submissionStatus),:status], [:id, :property, :agent])
diff --git a/lib/ontologies_linked_data/models/agents/identifier.rb b/lib/ontologies_linked_data/models/agents/identifier.rb
index 7f504456..5e7d77cc 100644
--- a/lib/ontologies_linked_data/models/agents/identifier.rb
+++ b/lib/ontologies_linked_data/models/agents/identifier.rb
@@ -21,6 +21,10 @@ def self.generate_identifier(notation, schema_agency)
       return RDF::URI.new(Goo.id_prefix + 'Identifiers/' + out.join(':')) if out.size.eql?(2)
     end
 
+    def embedded_doc
+      "#{self.id.split('/').last}"
+    end
+
     def no_url(inst,attr)
       inst.bring(attr) if inst.bring?(attr)
       notation = inst.send(attr)
diff --git a/lib/ontologies_linked_data/models/contact.rb b/lib/ontologies_linked_data/models/contact.rb
index 9af31a95..d06ed64e 100644
--- a/lib/ontologies_linked_data/models/contact.rb
+++ b/lib/ontologies_linked_data/models/contact.rb
@@ -6,6 +6,11 @@ class Contact < LinkedData::Models::Base
       attribute :email, enforce: [:existence]
 
       embedded true
+
+      def embedded_doc
+        "#{self.name} #{self.email}"
+      end
+
     end
   end
 end
diff --git a/lib/ontologies_linked_data/models/ontology.rb b/lib/ontologies_linked_data/models/ontology.rb
index a1f17271..442eb868 100644
--- a/lib/ontologies_linked_data/models/ontology.rb
+++ b/lib/ontologies_linked_data/models/ontology.rb
@@ -26,8 +26,8 @@ class OntologyAnalyticsError < StandardError; end
     model :ontology, :name_with => :acronym
     attribute :acronym, namespace: :omv,
-              enforce: [:unique, :existence, lambda { |inst,attr| validate_acronym(inst,attr) } ]
-    attribute :name, :namespace => :omv, enforce: [:unique, :existence]
+              enforce: [:unique, :existence, lambda { |inst,attr| validate_acronym(inst,attr) } ], fuzzy_search: true
+    attribute :name, :namespace => :omv, enforce: [:unique, :existence], fuzzy_search: true
     attribute :submissions,
               inverse: { on: :ontology_submission, attribute: :ontology },
              metadataMappings: ["dct:hasVersion", "pav:hasCurrentVersion", "pav:hasVersion", "prov:generalizationOf", "adms:next"]
     attribute :projects,
@@ -88,6 +88,10 @@ class OntologyAnalyticsError < StandardError; end
     # Cache
     cache_timeout 3600
 
+    enable_indexing(:ontology_metadata)
+
+    after_save :index_latest_submission
+
     def self.validate_acronym(inst, attr)
       inst.bring(attr) if inst.bring?(attr)
       acronym = inst.send(attr)
@@ -420,9 +424,8 @@ def delete(*args)
       end
 
       # remove index entries
-      unindex(index_commit)
-      unindex_properties(index_commit)
-
+      unindex_all_data(index_commit)
+
       # delete all files
       ontology_dir = File.join(LinkedData.settings.repository_folder, self.acronym.to_s)
       FileUtils.rm_rf(ontology_dir)
@@ -443,8 +446,28 @@ def save(*args)
       self
     end
 
-    def unindex(commit=true)
+    def index_latest_submission
+      last_s = latest_submission(status: :any)
+      return if last_s.nil?
+
+      last_s.ontology = self
+      last_s.index_update([:ontology])
+    end
+
+    def unindex_all_data(commit=true)
       unindex_by_acronym(commit)
+      unindex_properties(commit)
+    end
+
+    def embedded_doc
+      self.administeredBy.map{|x| x.bring_remaining}
+      doc = indexable_object
+      doc.delete(:id)
+      doc.delete(:resource_id)
+      doc.delete('ontology_viewOf_resource_model_t')
+      doc['ontology_viewOf_t'] = self.viewOf.id.to_s unless self.viewOf.nil?
+ doc[:resource_model_t] = doc.delete(:resource_model) + doc end def unindex_properties(commit=true) @@ -459,6 +482,7 @@ def unindex_by_acronym(commit=true) query = "submissionAcronym:#{acronym}" Class.unindexByQuery(query) Class.indexCommit(nil) if commit + OntologySubmission.clear_indexed_content(acronym) end def restricted? diff --git a/lib/ontologies_linked_data/models/ontology_submission.rb b/lib/ontologies_linked_data/models/ontology_submission.rb index c901b32e..da9f3266 100644 --- a/lib/ontologies_linked_data/models/ontology_submission.rb +++ b/lib/ontologies_linked_data/models/ontology_submission.rb @@ -13,6 +13,7 @@ module Models class OntologySubmission < LinkedData::Models::Base include LinkedData::Concerns::OntologySubmission::MetadataExtractor + include LinkedData::Concerns::OntologySubmission::IndexAllData include LinkedData::Concerns::OntologySubmission::Validators include LinkedData::Concerns::OntologySubmission::UpdateCallbacks extend LinkedData::Concerns::OntologySubmission::DefaultCallbacks @@ -26,39 +27,39 @@ class OntologySubmission < LinkedData::Models::Base FILE_SIZE_ZIPPING_THRESHOLD = 100 * 1024 * 1024 # 100MB model :ontology_submission, scheme: File.join(__dir__, '../../../config/schemes/ontology_submission.yml'), - name_with: ->(s) { submission_id_generator(s) } + name_with: ->(s) { submission_id_generator(s) } attribute :submissionId, type: :integer, enforce: [:existence] # Object description properties metadata # Configurable properties for processing - attribute :prefLabelProperty, type: :uri, default: ->(s) {Goo.vocabulary(:skos)[:prefLabel]} - attribute :definitionProperty, type: :uri, default: ->(s) {Goo.vocabulary(:skos)[:definition]} - attribute :synonymProperty, type: :uri, default: ->(s) {Goo.vocabulary(:skos)[:altLabel]} - attribute :authorProperty, type: :uri, default: ->(s) {Goo.vocabulary(:dc)[:creator]} + attribute :prefLabelProperty, type: :uri, default: ->(s) { Goo.vocabulary(:skos)[:prefLabel] } + attribute :definitionProperty, type: :uri, default: ->(s) { Goo.vocabulary(:skos)[:definition] } + attribute :synonymProperty, type: :uri, default: ->(s) { Goo.vocabulary(:skos)[:altLabel] } + attribute :authorProperty, type: :uri, default: ->(s) { Goo.vocabulary(:dc)[:creator] } attribute :classType, type: :uri - attribute :hierarchyProperty, type: :uri, default: ->(s) {default_hierarchy_property(s)} - attribute :obsoleteProperty, type: :uri, default: ->(s) {Goo.vocabulary(:owl)[:deprecated]} - attribute :obsoleteParent, type: :uri, default: ->(s) {RDF::URI.new("http://www.geneontology.org/formats/oboInOwl#ObsoleteClass")} - attribute :createdProperty, type: :uri, default: ->(s) {Goo.vocabulary(:dc)[:created]} - attribute :modifiedProperty, type: :uri, default: ->(s) {Goo.vocabulary(:dc)[:modified]} + attribute :hierarchyProperty, type: :uri, default: ->(s) { default_hierarchy_property(s) } + attribute :obsoleteProperty, type: :uri, default: ->(s) { Goo.vocabulary(:owl)[:deprecated] } + attribute :obsoleteParent, type: :uri, default: ->(s) { RDF::URI.new("http://www.geneontology.org/formats/oboInOwl#ObsoleteClass") } + attribute :createdProperty, type: :uri, default: ->(s) { Goo.vocabulary(:dc)[:created] } + attribute :modifiedProperty, type: :uri, default: ->(s) { Goo.vocabulary(:dc)[:modified] } # Ontology metadata # General metadata - attribute :URI, namespace: :omv, type: :uri, enforce: %i[existence distinct_of_identifier] + attribute :URI, namespace: :omv, type: :uri, enforce: %i[existence distinct_of_identifier], fuzzy_search: true attribute 
:versionIRI, namespace: :owl, type: :uri, enforce: [:distinct_of_URI] attribute :version, namespace: :omv attribute :status, namespace: :omv, enforce: %i[existence], default: ->(x) { 'production' } attribute :deprecated, namespace: :owl, type: :boolean, default: ->(x) { false } attribute :hasOntologyLanguage, namespace: :omv, type: :ontology_format, enforce: [:existence] attribute :hasFormalityLevel, namespace: :omv, type: :uri - attribute :hasOntologySyntax, namespace: :omv, type: :uri, default: ->(s) {ontology_syntax_default(s)} + attribute :hasOntologySyntax, namespace: :omv, type: :uri, default: ->(s) { ontology_syntax_default(s) } attribute :naturalLanguage, namespace: :omv, type: %i[list uri], enforce: [:lexvo_language] attribute :isOfType, namespace: :omv, type: :uri attribute :identifier, namespace: :dct, type: %i[list uri], enforce: [:distinct_of_URI] # Description metadata - attribute :description, namespace: :omv, enforce: %i[concatenate existence] + attribute :description, namespace: :omv, enforce: %i[concatenate existence], fuzzy_search: true attribute :homepage, namespace: :foaf, type: :uri attribute :documentation, namespace: :omv, type: :uri attribute :notes, namespace: :omv, type: :list @@ -102,7 +103,7 @@ class OntologySubmission < LinkedData::Models::Base # Usage metadata attribute :knownUsage, namespace: :omv, type: :list attribute :designedForOntologyTask, namespace: :omv, type: %i[list uri] - attribute :hasDomain, namespace: :omv, type: :list, default: ->(s) {ontology_has_domain(s)} + attribute :hasDomain, namespace: :omv, type: :list, default: ->(s) { ontology_has_domain(s) } attribute :coverage, namespace: :dct attribute :example, namespace: :vann, type: :list @@ -121,10 +122,10 @@ class OntologySubmission < LinkedData::Models::Base attribute :pullLocation, type: :uri # URI for pulling ontology attribute :isFormatOf, namespace: :dct, type: :uri attribute :hasFormat, namespace: :dct, type: %i[uri list] - attribute :dataDump, namespace: :void, type: :uri, default: -> (s) {data_dump_default(s)} - attribute :csvDump, type: :uri, default: -> (s) {csv_dump_default(s)} - attribute :uriLookupEndpoint, namespace: :void, type: :uri, default: -> (s) {uri_lookup_default(s)} - attribute :openSearchDescription, namespace: :void, type: :uri, default: -> (s) {open_search_default(s)} + attribute :dataDump, namespace: :void, type: :uri, default: -> (s) { data_dump_default(s) } + attribute :csvDump, type: :uri, default: -> (s) { csv_dump_default(s) } + attribute :uriLookupEndpoint, namespace: :void, type: :uri, default: -> (s) { uri_lookup_default(s) } + attribute :openSearchDescription, namespace: :void, type: :uri, default: -> (s) { open_search_default(s) } attribute :source, namespace: :dct, type: :list attribute :endpoint, namespace: :sd, type: %i[uri list], default: ->(s) { default_sparql_endpoint(s)} @@ -176,13 +177,14 @@ class OntologySubmission < LinkedData::Models::Base # Link to ontology attribute :ontology, type: :ontology, enforce: [:existence] - def self.agents_attrs - [:hasCreator, :publisher, :copyrightHolder, :hasContributor, - :translator, :endorsedBy, :fundedBy, :curatedBy] + %i[hasCreator publisher copyrightHolder hasContributor + translator endorsedBy fundedBy curatedBy] end + # Hypermedia settings - embed *[:contact, :ontology, :metrics] + agents_attrs + embed *%i[contact ontology metrics] + agents_attrs + def self.embed_values_hash out = { submissionStatus: [:code], hasOntologyLanguage: [:acronym] @@ -191,11 +193,11 @@ def self.embed_values_hash agent_attributes 
= LinkedData::Models::Agent.goo_attrs_to_load + [identifiers: LinkedData::Models::AgentIdentifier.goo_attrs_to_load, affiliations: LinkedData::Models::Agent.goo_attrs_to_load] - agents_attrs.each { |k| out[k] = agent_attributes} + agents_attrs.each { |k| out[k] = agent_attributes } out end - embed_values self.embed_values_hash + embed_values self.embed_values_hash serialize_default :contact, :ontology, :hasOntologyLanguage, :released, :creationDate, :homepage, :publication, :documentation, :version, :description, :status, :submissionId @@ -215,6 +217,8 @@ def self.embed_values_hash read_restriction_based_on ->(sub) { sub.ontology } access_control_load ontology: %i[administeredBy acl viewingRestriction] + enable_indexing(:ontology_metadata) + def initialize(*args) super(*args) @mutex = Mutex.new @@ -225,7 +229,7 @@ def synchronize(&block) end def self.agents_attr_uris - agents_attrs.map{ |x| self.attribute_uri(x) } + agents_attrs.map { |x| self.attribute_uri(x) } end def self.ontology_link(m) @@ -269,12 +273,8 @@ def self.segment_instance(sub) end def self.submission_id_generator(ss) - if !ss.ontology.loaded_attributes.include?(:acronym) - ss.ontology.bring(:acronym) - end - if ss.ontology.acronym.nil? - raise ArgumentError, "Submission cannot be saved if ontology does not have acronym" - end + ss.ontology.bring(:acronym) if !ss.ontology.loaded_attributes.include?(:acronym) + raise ArgumentError, "Submission cannot be saved if ontology does not have acronym" if ss.ontology.acronym.nil? return RDF::URI.new( "#{(Goo.id_prefix)}ontologies/#{CGI.escape(ss.ontology.acronym.to_s)}/submissions/#{ss.submissionId.to_s}" ) @@ -293,9 +293,7 @@ def self.copy_file_repository(acronym, submissionId, src, filename = nil) dst = File.join([path_to_repo, name]) FileUtils.copy(src, dst) logger.debug("File created #{dst} | #{"%o" % File.stat(dst).mode} | umask: #{File.umask}") # NCBO-795 - if not File.exist? dst - raise Exception, "Unable to copy #{src} to #{dst}" - end + raise Exception, "Unable to copy #{src} to #{dst}" if not File.exist? dst return dst end @@ -344,9 +342,7 @@ def sanity_check rescue Exception => e1 sum_only = nil - if i == num_calls - raise $!, "#{$!} after retrying #{i} times...", $!.backtrace - end + raise $!, "#{$!} after retrying #{i} times...", $!.backtrace if i == num_calls end end end @@ -358,9 +354,7 @@ def sanity_check return false elsif self.pullLocation self.errors[:pullLocation] = ["File at #{self.pullLocation.to_s} does not exist"] - if self.uploadFilePath.nil? - return remote_file_exists?(self.pullLocation.to_s) - end + return remote_file_exists?(self.pullLocation.to_s) if self.uploadFilePath.nil? return true end @@ -376,12 +370,10 @@ def sanity_check self.masterFileName = LinkedData::Utils::FileHelpers.automaster(self.uploadFilePath, self.hasOntologyLanguage.file_extension) return true elsif zip and self.masterFileName.nil? - #zip and masterFileName not set. The user has to choose. - if self.errors[:uploadFilePath].nil? - self.errors[:uploadFilePath] = [] - end + # zip and masterFileName not set. The user has to choose. + self.errors[:uploadFilePath] = [] if self.errors[:uploadFilePath].nil? - #check for duplicated names + # check for duplicated names repeated_names = LinkedData::Utils::FileHelpers.repeated_names_in_file_list(files) if repeated_names.length > 0 names = repeated_names.keys.to_s @@ -390,13 +382,13 @@ def sanity_check return false end - #error message with options to choose from. + # error message with options to choose from. 
self.errors[:uploadFilePath] << { :message => "Zip file detected, choose the master file.", :options => files } return false elsif zip and not self.masterFileName.nil? - #if zip and the user chose a file then we make sure the file is in the list. + # if zip and the user chose a file then we make sure the file is in the list. files = LinkedData::Utils::FileHelpers.files_from_zip(self.uploadFilePath) if not files.include? self.masterFileName if self.errors[:uploadFilePath].nil? @@ -462,9 +454,7 @@ def unzip_submission(logger) if zipped? zip_dst = self.zip_folder - if Dir.exist? zip_dst - FileUtils.rm_r [zip_dst] - end + FileUtils.rm_r [zip_dst] if Dir.exist? zip_dst FileUtils.mkdir_p zip_dst extracted = LinkedData::Utils::FileHelpers.unzip(self.uploadFilePath, zip_dst) @@ -490,7 +480,7 @@ def delete_old_submission_files FileUtils.rm(submission_files, force: true) submission_folders = FOLDERS_TO_DELETE.map { |f| File.join(path_to_repo, f) } - submission_folders.each {|d| FileUtils.remove_dir(d) if File.directory?(d)} + submission_folders.each { |d| FileUtils.remove_dir(d) if File.directory?(d) } end def zip_submission_uploaded_file @@ -499,10 +489,8 @@ def zip_submission_uploaded_file return self.uploadFilePath if zipped? return self.uploadFilePath if self.uploadFilePath.nil? || self.uploadFilePath.empty? - return self.uploadFilePath if File.size(self.uploadFilePath) < FILE_SIZE_ZIPPING_THRESHOLD - old_path = self.uploadFilePath new_path = Utils::FileHelpers.zip_file(old_path) FileUtils.rm(old_path, force: true) @@ -564,9 +552,7 @@ def class_count(logger = nil) unless mx.empty? count = mx[1][0].to_i - if self.hasOntologyLanguage.skos? - count += mx[1][1].to_i - end + count += mx[1][1].to_i if self.hasOntologyLanguage.skos? count_set = true end end @@ -645,9 +631,7 @@ def generate_rdf(logger, reasoning: true) end owlapi = owlapi_parser(logger: logger) - if !reasoning - owlapi.disable_reasoner - end + owlapi.disable_reasoner if !reasoning triples_file_path, missing_imports = owlapi.parse if missing_imports && missing_imports.length > 0 @@ -712,7 +696,10 @@ def loop_classes(logger, raw_paging, callbacks) iterate_classes = false # 1. init artifacts hash if not explicitly passed in the callback # 2. determine if class-level iteration is required - callbacks.each { |_, callback| callback[:artifacts] ||= {}; iterate_classes = true if callback[:caller_on_each] } + callbacks.each { |_, callback| callback[:artifacts] ||= {}; + if callback[:caller_on_each] + iterate_classes = true + end } process_callbacks(logger, callbacks, :caller_on_pre) { |callable, callback| callable.call(callback[:artifacts], logger, paging) } @@ -736,7 +723,9 @@ def loop_classes(logger, raw_paging, callbacks) logger.error("Empty page encountered. Retrying #{j} times...") sleep(2) page_classes = paging.page(page, size).all - logger.info("Success retrieving a page of #{page_classes.length} classes after retrying #{j} times...") unless page_classes.empty? + unless page_classes.empty? + logger.info("Success retrieving a page of #{page_classes.length} classes after retrying #{j} times...") + end end if page_classes.empty? @@ -824,14 +813,10 @@ def generate_missing_labels_each(artifacts = {}, logger, paging, page_classes, p if rdfs_labels && rdfs_labels.length > 1 && c.synonym.length > 0 rdfs_labels = (Set.new(c.label) - Set.new(c.synonym)).to_a.first - if rdfs_labels.nil? || rdfs_labels.length == 0 - rdfs_labels = c.label - end + rdfs_labels = c.label if rdfs_labels.nil? 
|| rdfs_labels.length == 0 end - if rdfs_labels and not (rdfs_labels.instance_of? Array) - rdfs_labels = [rdfs_labels] - end + rdfs_labels = [rdfs_labels] if rdfs_labels and not (rdfs_labels.instance_of? Array) label = nil if rdfs_labels && rdfs_labels.length > 0 @@ -919,20 +904,16 @@ def generate_obsolete_classes(logger, file_path) WHERE { ?class_id #{predicate_obsolete.to_ntriples} ?deprecated . } eos Goo.sparql_query_client.query(query_obsolete_predicate).each_solution do |sol| - unless ["0", "false"].include? sol[:deprecated].to_s - classes_deprecated << sol[:class_id].to_s - end + classes_deprecated << sol[:class_id].to_s unless ["0", "false"].include? sol[:deprecated].to_s end logger.info("Obsolete found #{classes_deprecated.length} for property #{self.obsoleteProperty.to_s}") end if self.obsoleteParent.nil? - #try to find oboInOWL obsolete. + # try to find oboInOWL obsolete. obo_in_owl_obsolete_class = LinkedData::Models::Class .find(LinkedData::Utils::Triples.obo_in_owl_obsolete_uri) .in(self).first - if obo_in_owl_obsolete_class - self.obsoleteParent = LinkedData::Utils::Triples.obo_in_owl_obsolete_uri - end + self.obsoleteParent = LinkedData::Utils::Triples.obo_in_owl_obsolete_uri if obo_in_owl_obsolete_class end if self.obsoleteParent class_obsolete_parent = LinkedData::Models::Class @@ -968,7 +949,7 @@ def add_submission_status(status) valid = status.is_a?(LinkedData::Models::SubmissionStatus) raise ArgumentError, "The status being added is not SubmissionStatus object" unless valid - #archive removes the other status + # archive removes the other status if status.archived? self.submissionStatus = [status] return self.submissionStatus @@ -980,7 +961,9 @@ def add_submission_status(status) if (status.error?) # remove the corresponding non_error status (if exists) non_error_status = status.get_non_error_status() - s.reject! { |stat| stat.get_code_from_id() == non_error_status.get_code_from_id() } unless non_error_status.nil? + unless non_error_status.nil? + s.reject! { |stat| stat.get_code_from_id() == non_error_status.get_code_from_id() } + end else # remove the corresponding non_error status (if exists) error_status = status.get_error_status() @@ -1048,7 +1031,7 @@ def archive_submission self.submissionStatus = nil status = LinkedData::Models::SubmissionStatus.find("ARCHIVED").first add_submission_status(status) - + unindex # Delete everything except for original ontology file. ontology.bring(:submissions) submissions = ontology.submissions @@ -1102,7 +1085,7 @@ def process_submission(logger, options = {}) else process_rdf = options[:process_rdf] == true ? true : false generate_missing_labels = options[:generate_missing_labels].nil? ? process_rdf : options[:generate_missing_labels] - extract_metadata = options[:extract_metadata].nil? ? process_rdf : options[:extract_metadata] + extract_metadata = options[:extract_metadata].nil? ? process_rdf : options[:extract_metadata] index_search = options[:index_search] == true ? true : false index_properties = options[:index_properties] == true ? true : false index_commit = options[:index_commit] == true ? 
true : false @@ -1153,8 +1136,7 @@ def process_submission(logger, options = {}) raise ArgumentError, error end status = LinkedData::Models::SubmissionStatus.find("RDF").first - remove_submission_status(status) #remove RDF status before starting - + remove_submission_status(status) # remove RDF status before starting generate_rdf(logger, reasoning: reasoning) add_submission_status(status) @@ -1193,43 +1175,45 @@ def process_submission(logger, options = {}) raw_paging = LinkedData::Models::Class.in(self).include(:prefLabel, :synonym, :label) loop_classes(logger, raw_paging, callbacks) - status = LinkedData::Models::SubmissionStatus.find("OBSOLETE").first - begin - generate_obsolete_classes(logger, file_path) - add_submission_status(status) - self.save - rescue Exception => e - logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") - logger.flush - add_submission_status(status.get_error_status) - self.save - # if obsolete fails the parsing fails - raise e - end + status = LinkedData::Models::SubmissionStatus.find("OBSOLETE").first + begin + generate_obsolete_classes(logger, file_path) + add_submission_status(status) + self.save + rescue Exception => e + logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") + logger.flush + add_submission_status(status.get_error_status) + self.save + # if obsolete fails the parsing fails + raise e end + end parsed = ready?(status: %i[rdf rdf_labels]) if index_search - raise Exception, "The submission #{self.ontology.acronym}/submissions/#{self.submissionId} cannot be indexed because it has not been successfully parsed" unless parsed + unless parsed + raise Exception, "The submission #{self.ontology.acronym}/submissions/#{self.submissionId} cannot be indexed because it has not been successfully parsed" + end status = LinkedData::Models::SubmissionStatus.find("INDEXED").first begin - index(logger, index_commit, false) + index_terms(logger, index_commit, false) add_submission_status(status) rescue Exception => e logger.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") logger.flush add_submission_status(status.get_error_status) - if File.file?(self.csv_path) - FileUtils.rm(self.csv_path) - end + FileUtils.rm(self.csv_path) if File.file?(self.csv_path) ensure self.save end end if index_properties - raise Exception, "The properties for the submission #{self.ontology.acronym}/submissions/#{self.submissionId} cannot be indexed because it has not been successfully parsed" unless parsed + unless parsed + raise Exception, "The properties for the submission #{self.ontology.acronym}/submissions/#{self.submissionId} cannot be indexed because it has not been successfully parsed" + end status = LinkedData::Models::SubmissionStatus.find("INDEXED_PROPERTIES").first begin index_properties(logger, index_commit, false) @@ -1244,7 +1228,9 @@ def process_submission(logger, options = {}) end if run_metrics - raise Exception, "Metrics cannot be generated on the submission #{self.ontology.acronym}/submissions/#{self.submissionId} because it has not been successfully parsed" unless parsed + unless parsed + raise Exception, "Metrics cannot be generated on the submission #{self.ontology.acronym}/submissions/#{self.submissionId} because it has not been successfully parsed" + end status = LinkedData::Models::SubmissionStatus.find("METRICS").first begin process_metrics(logger) @@ -1324,7 +1310,7 @@ def process_metrics(logger) self end - def index(logger, commit = true, optimize = true) + def index_terms(logger, commit = true, optimize = true) page 
= 0 size = 1000 count_classes = 0 @@ -1339,7 +1325,7 @@ def index(logger, commit = true, optimize = true) begin logger.info("Indexing ontology terms: #{self.ontology.acronym}...") t0 = Time.now - self.ontology.unindex(false) + self.ontology.unindex_all_data(false) logger.info("Removed ontology terms index (#{Time.now - t0}s)"); logger.flush paging = LinkedData::Models::Class.in(self).include(:unmapped).aggregate(:count, :children).page(page, size) @@ -1383,7 +1369,9 @@ def index(logger, commit = true, optimize = true) logger.error("Thread #{num + 1}: Empty page encountered. Retrying #{j} times...") sleep(2) page_classes = paging.page(page, size).all - logger.info("Thread #{num + 1}: Success retrieving a page of #{page_classes.length} classes after retrying #{j} times...") unless page_classes.empty? + unless page_classes.empty? + logger.info("Thread #{num + 1}: Success retrieving a page of #{page_classes.length} classes after retrying #{j} times...") + end end if page_classes.empty? @@ -1417,7 +1405,7 @@ def index(logger, commit = true, optimize = true) Thread.current["page_classes"].each do |c| begin # this cal is needed for indexing of properties - LinkedData::Models::Class.map_attributes(c, paging.equivalent_predicates, include_languages: true ) + LinkedData::Models::Class.map_attributes(c, paging.equivalent_predicates, include_languages: true) rescue Exception => e i = 0 num_calls = LinkedData.settings.num_retries_4store @@ -1531,14 +1519,14 @@ def index_properties(logger, commit = true, optimize = true) if optimize logger.info("Optimizing ontology properties index...") time = Benchmark.realtime do - LinkedData::Models::OntologyProperty.indexOptimize(nil ) + LinkedData::Models::OntologyProperty.indexOptimize(nil) end logger.info("Completed optimizing ontology properties index in #{time} seconds.") end end # Override delete to add removal from the search index - #TODO: revise this with a better process + # TODO: revise this with a better process def delete(*args) options = {} args.each { |e| options.merge!(e) if e.is_a?(Hash) } @@ -1546,8 +1534,7 @@ def delete(*args) index_commit = options[:index_commit] == false ? false : true super(*args) - self.ontology.unindex(index_commit) - self.ontology.unindex_properties(index_commit) + self.ontology.unindex_all_data(index_commit) self.bring(:metrics) if self.bring?(:metrics) self.metrics.delete if self.metrics @@ -1560,7 +1547,7 @@ def delete(*args) prev_sub = self.ontology.latest_submission() if prev_sub - prev_sub.index(LinkedData::Parser.logger || Logger.new($stderr)) + prev_sub.index_terms(LinkedData::Parser.logger || Logger.new($stderr)) prev_sub.index_properties(LinkedData::Parser.logger || Logger.new($stderr)) end end @@ -1640,15 +1627,11 @@ def roots(extra_include = [], page = nil, pagesize = nil, concept_schemes: [], c load_children = [:children] end - if extra_include.length > 0 - where.include(extra_include) - end + where.include(extra_include) if extra_include.length > 0 end where.all - if load_children.length > 0 - LinkedData::Models::Class.partially_load_children(classes, 99, self) - end + LinkedData::Models::Class.partially_load_children(classes, 99, self) if load_children.length > 0 classes.delete_if { |c| obs = !c.obsolete.nil? 
&& c.obsolete == true @@ -1793,9 +1776,7 @@ def check_ftp_file(uri) def self.loom_transform_literal(lit) res = [] lit.each_char do |c| - if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') - res << c.downcase - end + res << c.downcase if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') end return res.join '' end diff --git a/lib/ontologies_linked_data/models/users/user.rb b/lib/ontologies_linked_data/models/users/user.rb index ec99a45c..470bbc49 100644 --- a/lib/ontologies_linked_data/models/users/user.rb +++ b/lib/ontologies_linked_data/models/users/user.rb @@ -57,6 +57,10 @@ def self.show_apikey?(inst) end end + def embedded_doc + "#{self.firstName} #{self.lastName} #{self.username}" + end + def initialize(attributes = {}) # Don't allow passwordHash to be set here attributes.delete(:passwordHash) diff --git a/lib/ontologies_linked_data/sample_data/ontology.rb b/lib/ontologies_linked_data/sample_data/ontology.rb index 0528e238..61dcc04d 100644 --- a/lib/ontologies_linked_data/sample_data/ontology.rb +++ b/lib/ontologies_linked_data/sample_data/ontology.rb @@ -44,7 +44,7 @@ def self.create_ontologies_and_submissions(options = {}) o = LinkedData::Models::Ontology.new({ acronym: acronym_count, - name: name || "#{acronym_count} Ontology", + name: name ? "#{name}#{count > 0 ? count : ''}" : "#{acronym_count} Ontology", administeredBy: [u], summaryOnly: false, ontologyType: ontology_type diff --git a/test/models/test_search.rb b/test/models/test_search.rb new file mode 100644 index 00000000..6a92b606 --- /dev/null +++ b/test/models/test_search.rb @@ -0,0 +1,180 @@ +require_relative '../test_case' + +class TestSearch < LinkedData::TestCase + + def self.after_suite + backend_4s_delete + LinkedData::Models::Ontology.indexClear + LinkedData::Models::Agent.indexClear + end + + def setup + self.class.after_suite + end + + def test_search_ontology + ont_count, ont_acronyms, created_ontologies = create_ontologies_and_submissions({ + process_submission: false, + acronym: 'BROTEST', + name: 'ontTEST Bla', + file_path: '../../../../test/data/ontology_files/BRO_v3.2.owl', + ont_count: 3, + submission_count: 3 + }) + + ontologies = LinkedData::Models::Ontology.search('*:*', { fq: 'resource_model: "ontology"' })['response']['docs'] + assert_equal 3, ontologies.size + ontologies.each do |ont| + select_ont = created_ontologies.select { |ont_created| ont_created.id.to_s.eql?(ont['id']) }.first + refute_nil select_ont + select_ont.bring_remaining + assert_equal ont['name_text'], select_ont.name + assert_equal ont['acronym_text'], select_ont.acronym + assert_equal ont['viewingRestriction_t'], select_ont.viewingRestriction + assert_equal ont['ontologyType_t'], select_ont.ontologyType.id + end + + submissions = LinkedData::Models::Ontology.search('*:*', { fq: 'resource_model: "ontology_submission"' })['response']['docs'] + assert_equal 9, submissions.size + + submissions.each do |sub| + created_sub = LinkedData::Models::OntologySubmission.find(RDF::URI.new(sub['id'])).first&.bring_remaining + refute_nil created_sub + assert_equal sub['description_text'], created_sub.description + assert_equal sub['submissionId_i'], created_sub.submissionId + assert_equal sub['URI_text'], created_sub.URI + assert_equal sub['status_t'], created_sub.status + assert_equal sub['deprecated_b'], created_sub.deprecated + assert_equal sub['hasOntologyLanguage_t'], created_sub.hasOntologyLanguage.id.to_s + assert_equal sub['released_dt'], 
created_sub.released.utc.strftime('%Y-%m-%dT%H:%M:%SZ') + assert_equal sub['creationDate_dt'], created_sub.creationDate.utc.strftime('%Y-%m-%dT%H:%M:%SZ') + assert_equal(sub['contact_txt'], created_sub.contact.map { |x| x.bring_remaining.embedded_doc }) + assert_equal sub['dataDump_t'], created_sub.dataDump + assert_equal sub['csvDump_t'], created_sub.csvDump + assert_equal sub['uriLookupEndpoint_t'], created_sub.uriLookupEndpoint + assert_equal sub['openSearchDescription_t'], created_sub.openSearchDescription + assert_equal sub['endpoint_txt'], created_sub.endpoint + assert_equal sub['uploadFilePath_t'], created_sub.uploadFilePath + assert_equal sub['submissionStatus_txt'], created_sub.submissionStatus.map(&:id) + embed_doc = created_sub.ontology.bring_remaining.embedded_doc + embed_doc.each do |k, v| + if v.is_a?(Array) + assert_equal v, Array(sub["ontology_#{k}"]) + else + assert_equal v, sub["ontology_#{k}"] + end + end + end + end + + def test_search_agents + @@user1 = LinkedData::Models::User.new(:username => 'user111221', :email => 'some111221@email.org') + @@user1.passwordHash = 'some random pass hash' + @@user1.save + + @agents = [ + LinkedData::Models::Agent.new(name: 'name 0', email: 'test_0@test.com', agentType: 'organization', creator: @@user1), + LinkedData::Models::Agent.new(name: 'name 1', email: 'test_1@test.com', agentType: 'organization', creator: @@user1), + LinkedData::Models::Agent.new(name: 'name 2', email: 'test_2@test.com', agentType: 'person', creator: @@user1) + ] + @identifiers = [ + LinkedData::Models::AgentIdentifier.new(notation: '000h6jb29', schemaAgency: 'ROR', creator: @@user1), + LinkedData::Models::AgentIdentifier.new(notation: '000h6jb29', schemaAgency: 'ORCID', creator: @@user1), + ] + + @identifiers.each { |i| i.save } + affiliations = @agents[0..1].map { |a| a.save } + agent = @agents.last + agent.affiliations = affiliations + + agent.identifiers = @identifiers + agent.save + + agents = LinkedData::Models::Agent.search('*:*')['response']['docs'] + + assert_equal 3, agents.size + agents.each do |a| + select_agent = @agents.select { |agent_created| agent_created.id.to_s.eql?(a['id']) }.first + refute_nil select_agent + select_agent.bring_remaining + + assert_equal a['name_text'], select_agent.name + assert_equal a['email_text'], select_agent.email + assert_equal a['agentType_t'], select_agent.agentType + assert_equal(a['affiliations_txt'], select_agent.affiliations&.map { |x| x.bring_remaining.embedded_doc }) + assert_equal(a['identifiers_texts'], select_agent.identifiers&.map { |x| x.bring_remaining.embedded_doc }) + assert_equal a['creator_t'], select_agent.creator.bring_remaining.embedded_doc + end + + @identifiers.each { |i| i.delete } + @agents.each { |a| a.delete } + @@user1.delete + end + + def test_search_ontology_data + ont_count, ont_acronyms, created_ontologies = create_ontologies_and_submissions({ + process_submission: true, + process_options: { + process_rdf: true, extract_metadata: false, + generate_missing_labels: false, + index_search: false, + }, + acronym: 'BROTEST', + name: 'ontTEST Bla', + file_path: 'test/data/ontology_files/thesaurusINRAE_nouv_structure.skos', + ont_count: 1, + submission_count: 1, + ontology_format: 'SKOS' + }) + ont_sub = LinkedData::Models::Ontology.find('BROTEST-0').first + ont_sub = ont_sub.latest_submission + time = Benchmark.realtime do + ont_sub.index_all_data(Logger.new($stdout)) + end + puts time + conn = Goo.search_client(:ontology_data) + response = conn.search('*') + + count = 
Goo.sparql_query_client.query("SELECT (COUNT( DISTINCT ?id) as ?c) FROM <#{ont_sub.id}> WHERE {?id ?p ?v}") + .first[:c] + .to_i + + assert_equal count, response['response']['numFound'] + + response = conn.search('*', fq: ' resource_id:"http://opendata.inrae.fr/thesaurusINRAE/c_10065"') + + assert_equal 1, response['response']['numFound'] + doc = response['response']['docs'].first + + expected_doc = { + 'id' => 'http://opendata.inrae.fr/thesaurusINRAE/c_10065_BROTEST-0', + 'submission_id_t' => 'http://data.bioontology.org/ontologies/BROTEST-0/submissions/1', + 'ontology_t' => 'BROTEST-0', + 'resource_id' => 'http://opendata.inrae.fr/thesaurusINRAE/c_10065', + 'type_txt' => %w[http://www.w3.org/2004/02/skos/core#Concept http://www.w3.org/2002/07/owl#NamedIndividual], + 'http___www.w3.org_2004_02_skos_core_inScheme_txt' => %w[http://opendata.inrae.fr/thesaurusINRAE/thesaurusINRAE http://opendata.inrae.fr/thesaurusINRAE/mt_53], + 'http___www.w3.org_2004_02_skos_core_broader_t' => 'http://opendata.inrae.fr/thesaurusINRAE/c_9937', + 'http___www.w3.org_2004_02_skos_core_altLabel_txt' => ['GMO food', + 'aliment transgénique', + 'aliment OGM', + 'transgenic food'], + 'http___www.w3.org_2004_02_skos_core_prefLabel_txt' => ['genetically modified food', + 'aliment génétiquement modifié'], + 'resource_model' => 'ontology_submission' + } + + doc.delete('_version_') + + assert_equal expected_doc['id'], doc['id'] + assert_equal expected_doc['submission_id_t'], doc['submission_id_t'] + assert_equal expected_doc['ontology_t'], doc['ontology_t'] + assert_equal expected_doc['resource_id'], doc['resource_id'] + assert_equal expected_doc['type_txt'].sort, doc['type_txt'].sort + assert_equal expected_doc['http___www.w3.org_2004_02_skos_core_inScheme_txt'].sort, doc['http___www.w3.org_2004_02_skos_core_inScheme_txt'].sort + assert_equal expected_doc['http___www.w3.org_2004_02_skos_core_broader_t'], doc['http___www.w3.org_2004_02_skos_core_broader_t'] + assert_equal expected_doc['http___www.w3.org_2004_02_skos_core_altLabel_txt'].sort, doc['http___www.w3.org_2004_02_skos_core_altLabel_txt'].sort + assert_equal expected_doc['http___www.w3.org_2004_02_skos_core_prefLabel_txt'].sort, doc['http___www.w3.org_2004_02_skos_core_prefLabel_txt'].sort + assert_equal expected_doc['resource_model'], doc['resource_model'] + + end +end