diff --git a/.gitignore b/.gitignore
index d54317d8..9d390c9b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,36 +1,40 @@
-data
-start_solr.sh
-stop_solr.sh
 .local
 .solargraph.yml
 .idea
 .cache
 .irb_history
 .byebug_history
-ht_secure_data.rb
-logs
-overlap/overlap_umich.tsv
-tmp/
-lib/translation_maps/hlb.json.gz
-lib/translation_maps/umich/libLocInfo.yaml
 .bash_history
 .env
 .bundle
-/umich_catalog_indexing/.gem
-/umich_catalog_indexing/.env
-/umich_catalog_indexing/debug*
-/umich_catalog_indexing/.m2
-/umich_catalog_indexing/.ssh/*
+.gem
+
+umich_catalog_indexing/debug*
+umich_catalog_indexing/.m2
+umich_catalog_indexing/.ssh/*
 umich_catalog_indexing/coverage/
+
+umich_catalog_indexing/lib/translation_maps/hlb.json.gz
+umich_catalog_indexing/lib/translation_maps/umich/libLocInfo.yaml
+
+umich_catalog_indexing/scratch/*
+!umich_catalog_indexing/scratch/.keep
+
+umich_catalog_indexing/examples/*.xml
+umich_catalog_indexing/examples/*/*.xml
+umich_catalog_indexing/examples/*.tar.gz
+umich_catalog_indexing/examples/*/*.tar.gz
+
 /sftp/ssh/*
-!/sftp/ssh/README.md
+!/sftp/ssh/.keep
 /sftp/search_daily_bibs/*.xml
-/umich_catalog_indexing/*.xml
-/umich_catalog_indexing/*/*.xml
-/umich_catalog_indexing/*/*/*.xml
-*.tar.gz
+/sftp/search_daily_bibs/*.tar.gz
 !sftp/search_daily_bibs/sample.tar.gz
 !sftp/search_daily_bibs/birds_2022021017_21131448650006381_new.tar.gz
+
 support_dbs/scratch/*
 !support_dbs/scratch/.keep
+
 biblio/biblio.zip
+
+overlap/*.tsv
diff --git a/compose.yml b/compose.yml
index 11c9c7a6..613f362d 100644
--- a/compose.yml
+++ b/compose.yml
@@ -55,10 +55,6 @@ services:
       - ./umich_catalog_indexing/.:/app
       - ./sftp/ssh/ssh_client_rsa_key:/etc/secret-volume/id_rsa:ro
       - gem_cache:/gems
-    environment:
-      - REDIS_URL=redis://redis:6379
-      - HLB_XML_ENDPOINT=https://apps.lib.umich.edu/browse/categories/xml.php
-      - NODB=1
     env_file:
       - ./umich_catalog_indexing/.env
      - ./umich_catalog_indexing/env.development
@@ -109,15 +105,15 @@ services:
       #- MARIADB_PASSWORD=password
       #- MARIADB_DATABASE=hathifiles

-  #hathioverlap:
-    #image: mariadb
-    #volumes:
-      #- overlap:/var/lib/mysql
-    #environment:
-      #- MARIADB_ROOT_PASSWORD=pass
-      #- MARIADB_USER=user
-      #- MARIADB_PASSWORD=password
-      #- MARIADB_DATABASE=overlap
+  hathioverlap:
+    build: overlap/.
+    volumes:
+      - overlap:/var/lib/mysql
+    environment:
+      - MARIADB_ROOT_PASSWORD=pass
+      - MARIADB_USER=user
+      - MARIADB_PASSWORD=password
+      - MARIADB_DATABASE=overlap

   prometheus:
     image: prom/prometheus
diff --git a/umich_catalog_indexing/scratch/zephir_upd_20220301.json.gz b/sftp/search_daily_bibs/zephir_upd_20220301.json.gz
similarity index 100%
rename from umich_catalog_indexing/scratch/zephir_upd_20220301.json.gz
rename to sftp/search_daily_bibs/zephir_upd_20220301.json.gz
diff --git a/umich_catalog_indexing/scratch/README.md b/sftp/ssh/.keep
similarity index 100%
rename from umich_catalog_indexing/scratch/README.md
rename to sftp/ssh/.keep
diff --git a/umich_catalog_indexing/env.development b/umich_catalog_indexing/env.development
index 5fc0ee2c..841a7605 100644
--- a/umich_catalog_indexing/env.development
+++ b/umich_catalog_indexing/env.development
@@ -1,14 +1,3 @@
-HATHIFILE_HOST=hathidb
-HATHIFILE_DB=hathifiles
-HATHIFILE_USER=root
-HATHIFILE_PASSWORD=pass
-HATHI_OVERLAP_HOST=hathioverlap
-HATHI_OVERLAP_DB=bibliosearch
-HATHI_OVERLAP_USER=root
-HATHI_OVERLAP_PASSWORD=pass
-ALMA_API_HOST=https://api-na.hosted.exlibrisgroup.com
-ALMA_FILES_USER=alma
-ALMA_FILES_HOST=sftp
 SSH_KEY_PATH=/etc/secret-volume/id_rsa
 DAILY_ALMA_FILES_PATH="search_daily_bibs"
 MACC_PRODUCTION_SOLR_URL=http://solr:8983/solr/biblio
@@ -16,7 +5,7 @@ HATCHER_PRODUCTION_SOLR_URL=http://solr:8983/solr/biblio
 LIVE_SOLR_URL=http://solr:8983/solr/biblio
 REINDEX_SOLR_URL=http://solr:8983/solr/biblio
 SIDEKIQ_SUPERVISOR_HOST=http://supervisor:3000
-SOLR_USER=solr
-SOLR_PASSWORD=SolrRocks
 SOLRCLOUD_ON=true
 PROMETHEUS_PUSH_GATEWAY=http://pushgateway:9091
+HLB_XML_ENDPOINT=https://apps.lib.umich.edu/browse/categories/xml.php
+REDIS_URL=redis://redis:6379
diff --git a/umich_catalog_indexing/indexers/settings.rb b/umich_catalog_indexing/indexers/settings.rb
index dea88961..66d32096 100644
--- a/umich_catalog_indexing/indexers/settings.rb
+++ b/umich_catalog_indexing/indexers/settings.rb
@@ -1,36 +1,36 @@
 $:.unshift "#{File.dirname(__FILE__)}/../lib"
-require 'set'
+require "services"
+require "set"

-require 'library_stdnums'
+require "library_stdnums"

-require 'traject/macros/marc21_semantics'
+require "traject/macros/marc21_semantics"
 extend Traject::Macros::Marc21Semantics

-require 'traject/macros/marc_format_classifier'
+require "traject/macros/marc_format_classifier"
 extend Traject::Macros::MarcFormats

-require 'ht_traject'
+require "ht_traject"
 extend HathiTrust::Traject::Macros
 extend Traject::UMichFormat::Macros

-require 'marc/fastxmlwriter'
+require "marc/fastxmlwriter"

-require 'marc_record_speed_monkeypatch'
-require 'marc4j_fix'
+require "marc_record_speed_monkeypatch"
+require "marc4j_fix"

-UmichOverlap = if ENV['NODB']
-                 require "ht_traject/no_db_mocks/ht_overlap"
-                 HathiTrust::NoDB::UmichOverlap
-               else
-                 require 'ht_traject/ht_overlap.rb'
-                 HathiTrust::UmichOverlap
-               end
+UmichOverlap = if S.no_db?
+ require "ht_traject/no_db_mocks/ht_overlap" + HathiTrust::NoDB::UmichOverlap +else + require "ht_traject/ht_overlap" + HathiTrust::UmichOverlap +end settings do store "log.batch_progress", 10_000 end - logger.info RUBY_DESCRIPTION ################################ @@ -39,4 +39,3 @@ # Set up an area in the clipboard for use storing intermediate stuff each_record HathiTrust::Traject::Macros.setup - diff --git a/umich_catalog_indexing/indexers/umich_alma.rb b/umich_catalog_indexing/indexers/umich_alma.rb index 3a9597e5..ccb1abe9 100644 --- a/umich_catalog_indexing/indexers/umich_alma.rb +++ b/umich_catalog_indexing/indexers/umich_alma.rb @@ -1,30 +1,29 @@ -require 'umich_traject' -require 'ht_traject' -#require 'ht_traject/ht_overlap.rb' -require 'json' -require 'umich_traject/floor_location.rb' - -HathiFiles = if ENV['NODB'] - require 'ht_traject/no_db_mocks/ht_hathifiles' - HathiTrust::NoDB::HathiFiles - else - require 'ht_traject/ht_hathifiles.rb' - HathiTrust::HathiFiles - end - -libLocInfo = Traject::TranslationMap.new('umich/libLocInfo') -electronic_collections = Traject::TranslationMap.new('umich/electronic_collections') +require "umich_traject" +require "ht_traject" +# require 'ht_traject/ht_overlap.rb' +require "json" +require "umich_traject/floor_location" + +HathiFiles = if ENV["NODB"] + require "ht_traject/no_db_mocks/ht_hathifiles" + HathiTrust::NoDB::HathiFiles +else + require "ht_traject/ht_hathifiles" + HathiTrust::HathiFiles +end -UMich::FloorLocation.configure('lib/translation_maps/umich/floor_locations.json') +libLocInfo = Traject::TranslationMap.new("umich/libLocInfo") +electronic_collections = Traject::TranslationMap.new("umich/electronic_collections") +UMich::FloorLocation.configure("lib/translation_maps/umich/floor_locations.json") -# skip course reserve records +# skip course reserve records each_record do |r, context| cr_pattern = /CR_RESTRICTED/ - r.each_by_tag('999') do |f| - if f['a'] and f['a'] =~ /CR_RESTRICTED/ - id = context.output_hash['id'] + r.each_by_tag("999") do |f| + if f["a"] and f["a"] =~ /CR_RESTRICTED/ + id = context.output_hash["id"] context.skip!("#{id} : Course reserve record skipped") end end @@ -32,9 +31,9 @@ # 035 $a (MiU)003113534MIU01 aleph_pattern = /^\(MiU\)\d{9}MIU01$/ -to_field 'aleph_id' do |record, acc, context| - if context.clipboard[:ht][:record_source] == 'alma' - aleph_spec = Traject::MarcExtractor.cached('035a') +to_field "aleph_id" do |record, acc, context| + if context.clipboard[:ht][:record_source] == "alma" + aleph_spec = Traject::MarcExtractor.cached("035a") aleph_spec.extract(record).grep(aleph_pattern).each { |alephnum| acc << alephnum[5, 9] break # single-valued field, some alma records have multiple occurrences, so only use first @@ -42,75 +41,74 @@ end end -cc_to_of = Traject::TranslationMap.new('ht/collection_code_to_original_from') +cc_to_of = Traject::TranslationMap.new("ht/collection_code_to_original_from") each_record do |r, context| - - locations = Array.new() - inst_codes = Array.new() - availability = Array.new() - sh = Hash.new() + locations = [] + inst_codes = [] + availability = [] + sh = {} has_e56 = false - id = context.output_hash['id'] + id = context.output_hash["id"] - # "OWN" field - r.each_by_tag(['958', 'OWN']) do |f| - locations << f['a'].upcase if f['a'] - inst_codes << f['a'].upcase if f['a'] + # "OWN" field + r.each_by_tag(["958", "OWN"]) do |f| + locations << f["a"].upcase if f["a"] + inst_codes << f["a"].upcase if f["a"] end - hol_list = Array.new() + hol_list = [] # this is ugly--needs to be 
refactored - if context.clipboard[:ht][:record_source] == 'zephir' - #cc_to_of = Traject::TranslationMap.new('ht/collection_code_to_original_from') + if context.clipboard[:ht][:record_source] == "zephir" + # cc_to_of = Traject::TranslationMap.new('ht/collection_code_to_original_from') # add hol for HT volumes - items = Array.new() - #etas_status = context.clipboard[:ht][:overlap][:count_etas] > 0 # make it a boolean - r.each_by_tag('974') do |f| - next unless f['u'] - item = Hash.new() - item[:id] = f['u'] - item[:rights] = f['r'] - item[:description] = f['z'] - item[:collection_code] = f['c'] - item[:source] = cc_to_of[f['c'].downcase] + items = [] + # etas_status = context.clipboard[:ht][:overlap][:count_etas] > 0 # make it a boolean + r.each_by_tag("974") do |f| + next unless f["u"] + item = {} + item[:id] = f["u"] + item[:rights] = f["r"] + item[:description] = f["z"] + item[:collection_code] = f["c"] + item[:source] = cc_to_of[f["c"].downcase] item[:access] = !!(item[:rights] =~ /^(pd|world|ic-world|cc|und-world)/) - #item[:status] = statusFromRights(item[:rights], etas_status) + # item[:status] = statusFromRights(item[:rights], etas_status) item[:status] = statusFromRights(item[:rights]) items << item end if items.any? - hol = Hash.new() - hol[:library] = 'HathiTrust Digital Library' + hol = {} + hol[:library] = "HathiTrust Digital Library" hol[:items] = sortItems(items) hol_list << hol - locations << 'MiU' - inst_codes << 'MIU' - inst_codes << 'MIFLIC' + locations << "MiU" + inst_codes << "MIU" + inst_codes << "MIFLIC" # get ht-related availability values - availability << 'avail_ht' + availability << "avail_ht" hol[:items].each do |item| - availability << 'avail_ht_fulltext' if item[:access] - availability << 'avail_online' if item[:access] + availability << "avail_ht_fulltext" if item[:access] + availability << "avail_online" if item[:access] end - #availability << 'avail_ht_etas' if context.clipboard[:ht][:overlap][:count_etas] > 0 + # availability << 'avail_ht_etas' if context.clipboard[:ht][:overlap][:count_etas] > 0 end else - holdings = Traject::UMich::Holdings.new(r, context, libLocInfo, UMich::FloorLocation, HathiFiles).run + holdings = Traject::UMich::Holdings.new(r, context, libLocInfo, UMich::FloorLocation, HathiFiles).run locations.push(*holdings[:locations]) inst_codes.push(*holdings[:inst_codes]) availability.push(*holdings[:availability]) hol_list.push(*holdings[:hol_list]) end - - #TODO This is how to empty it - if hol_list.empty? - id = context.output_hash['id']&.first || "" - electronic_collections_for_id = electronic_collections[id] + + # TODO This is how to empty it + if hol_list.empty? + id = context.output_hash["id"]&.first || "" + electronic_collections_for_id = electronic_collections[id] if electronic_collections_for_id - + hol_list = electronic_collections_for_id.map do |x| { link: x["link"], @@ -123,12 +121,12 @@ finding_aid: false } end - availability << 'avail_online' + availability << "avail_online" locations << "ELEC" end - #this message is in debug - #I think this will be check for level url (from the translation map from - #alma api.) If so, add in that electronic item. + # this message is in debug + # I think this will be check for level url (from the translation map from + # alma api.) If so, add in that electronic item. # maybe look for the coming soon? 
I need to go look elsewhere probably # else suppress end @@ -136,33 +134,32 @@ context.clipboard[:ht][:availability] = availability.compact.uniq.sort context.clipboard[:ht][:locations] = locations.compact.uniq.sort context.clipboard[:ht][:inst_codes] = inst_codes.compact.uniq.sort - end -to_field 'hol' do |record, acc, context| +to_field "hol" do |record, acc, context| acc << context.clipboard[:ht][:hol_list].to_json end -to_field 'availability' do |record, acc, context| - avail_map = Traject::TranslationMap.new('umich/availability_map_umich') +to_field "availability" do |record, acc, context| + avail_map = Traject::TranslationMap.new("umich/availability_map_umich") acc.replace Array(context.clipboard[:ht][:availability].map { |code| avail_map[code] }) end location_map = Traject::UMich.location_map -to_field 'location' do |record, acc, context| +to_field "location" do |record, acc, context| locations = Array(context.clipboard[:ht][:locations]) - #acc.replace locations.map { |code| location_map[code] } + # acc.replace locations.map { |code| location_map[code] } acc.replace locations acc.map! { |code| location_map[code.strip] } acc.flatten! acc.uniq! end -#MIU, MIU-C, MIU-H, MIFLIC -inst_map = Traject::TranslationMap.new('umich/institution_map') -to_field 'institution' do |record, acc, context| +# MIU, MIU-C, MIU-H, MIFLIC +inst_map = Traject::TranslationMap.new("umich/institution_map") +to_field "institution" do |record, acc, context| inst_codes = Array(context.clipboard[:ht][:inst_codes]) - #acc << 'MiU' if context.clipboard[:ht][:record_source] == 'zephir' # add MiU as an institution for zephir records + # acc << 'MiU' if context.clipboard[:ht][:record_source] == 'zephir' # add MiU as an institution for zephir records acc.replace inst_codes acc.map! { |code| inst_map[code.strip] } acc.flatten! @@ -182,12 +179,12 @@ # def ejournal?(context) - elec = context.clipboard[:ht][:hol_list].any? { |hol| hol[:library]&.include? 'ELEC' } - form = context.output_hash['format'] - elec and form&.include?('Serial') + elec = context.clipboard[:ht][:hol_list].any? { |hol| hol[:library]&.include? 
"ELEC" } + form = context.output_hash["format"] + elec and form&.include?("Serial") end -FILING_TITLE_880_extractor = Traject::MarcExtractor.new('245abdefgknp', alternate_script: :only) +FILING_TITLE_880_extractor = Traject::MarcExtractor.new("245abdefgknp", alternate_script: :only) def filing_titles_880(r) rv = [] @@ -211,8 +208,6 @@ def latinized_in_double_brackets(str) m = DOUBLE_BRACKET_TITLE.match(str) if m m[1] - else - nil end end @@ -223,17 +218,15 @@ def latinized_after_equal_title(str) m = AFTER_EQUAL_TITLE.match(str) if m m[1] - else - nil end end # Get the filing versions of the primary title and send it to solr to # figure out where to put it in the A-Z list -- but only if it's an ejournal # -to_field 'title_initial', extract_marc_filing_version('245abdefgknp', include_original: false), - first_only, - trim_punctuation do |rec, acc, context| +to_field "title_initial", extract_marc_filing_version("245abdefgknp", include_original: false), + first_only, + trim_punctuation do |rec, acc, context| if !ejournal?(context) acc.replace [] else @@ -241,12 +234,12 @@ def latinized_after_equal_title(str) if filing_title && !string_starts_with_latin(filing_title) extra_filing_title = filing_titles_880(rec).select { |t| string_starts_with_latin(t) }.first best_guess = latinized_in_double_brackets(filing_title) || latinized_after_equal_title(filing_title) || extra_filing_title -# if !string_starts_with_latin(best_guess) -# best_guess = latinized_in_double_brackets(extra_filing_title) || latinized_after_equal_title(extra_filing_title) || filing_title -# end + # if !string_starts_with_latin(best_guess) + # best_guess = latinized_in_double_brackets(extra_filing_title) || latinized_after_equal_title(extra_filing_title) || filing_title + # end if best_guess and !best_guess.empty? acc.replace [best_guess] -# logger.info "A-Z List: replaced #{context.output_hash['title_common'].first} with #{best_guess}" + # logger.info "A-Z List: replaced #{context.output_hash['title_common'].first} with #{best_guess}" end end end @@ -254,18 +247,18 @@ def latinized_after_equal_title(str) # sorting routine for enum/chron (description) item sort def enumcronSort a, b - return a[:sortstring] <=> b[:sortstring] + a[:sortstring] <=> b[:sortstring] end # Create a sortable string based on the digit strings present in an # enumcron string def enumcronSortString str - rv = '0' + rv = "0" str.scan(/\d+/).each do |nums| rv += nums.size.to_s + nums end - return rv + rv end def sortItems arr @@ -274,21 +267,20 @@ def sortItems arr # First, add the _sortstring entries arr.each do |h| - #if h.has_key? 'description' - if h[:description] - h[:sortstring] = enumcronSortString(h[:description]) + # if h.has_key? 'description' + h[:sortstring] = if h[:description] + enumcronSortString(h[:description]) else - h[:sortstring] = '0' + "0" end end # Then sort it - arr.sort! { |a, b| self.enumcronSort(a, b) } + arr.sort! 
{ |a, b| enumcronSort(a, b) } # Then remove the sortstrings arr.each do |h| h.delete(:sortstring) end - return arr + arr end - diff --git a/umich_catalog_indexing/lib/ht_traject/ht_dbh.rb b/umich_catalog_indexing/lib/ht_traject/ht_dbh.rb deleted file mode 100644 index 507e2172..00000000 --- a/umich_catalog_indexing/lib/ht_traject/ht_dbh.rb +++ /dev/null @@ -1,37 +0,0 @@ -#require_relative '../ht_secure_data' -require 'sequel' - -module HathiTrust - module DBH - #extend HathiTrust::SecureData - begin - DB = Sequel.connect("mysql2://#{ENV.fetch("HATHIFILE_HOST")}/#{ENV.fetch("HATHIFILE_DB")}?user=#{ENV.fetch("HATHIFILE_USER")}&password=#{ENV.fetch("HATHIFILE_PASSWORD")}&useTimezone=true&serverTimezone=UTC", login_timeout: 2, pool_timeout: 10, max_connections: 6) - rescue => e - STDERR.puts e - STDERR.puts "************************************************************" - STDERR.puts "Cannot Reach #{ENV.fetch("HATHIFILE_HOST")}" - STDERR.puts "If you're on a machine where you can't reach the database," - STDERR.puts "run with environment NODB=1 to skip all db stuff" - STDERR.puts "************************************************************" - exit 1 - end - - end - - module DBH_overlap - #extend HathiTrust::HTOverlap - begin - DB = Sequel.connect("mysql2://#{ENV.fetch("HATHI_OVERLAP_HOST")}/#{ENV.fetch("HATHI_OVERLAP_DB")}?user=#{ENV.fetch("HATHI_OVERLAP_USER")}&password=#{ENV.fetch("HATHI_OVERLAP_PASSWORD")}&useTimezone=true&serverTimezone=UTC", login_timeout: 2, pool_timeout: 10, max_connections: 6) - rescue => e - STDERR.puts e - STDERR.puts "************************************************************" - STDERR.puts "Cannot Reach #{ENV.fetch("HATHI_OVERLAP_HOST")}" - STDERR.puts "If you're on a machine where you can't reach the database," - STDERR.puts "run with environment NODB=1 to skip all db stuff" - STDERR.puts "************************************************************" - exit 1 - end - end - -end - diff --git a/umich_catalog_indexing/lib/ht_traject/ht_hathifiles.rb b/umich_catalog_indexing/lib/ht_traject/ht_hathifiles.rb index b65f2e02..8c458df0 100644 --- a/umich_catalog_indexing/lib/ht_traject/ht_hathifiles.rb +++ b/umich_catalog_indexing/lib/ht_traject/ht_hathifiles.rb @@ -1,11 +1,9 @@ -require 'traject' -require_relative 'ht_dbh' -require 'sequel' +require "traject" +require "sequel" module HathiTrust - class HathiFiles - DB = HathiTrust::DBH::DB + DB = S.hathifiles_mysql SELECTED_COLS = [ Sequel[:hf][:htid].as(:id), @@ -15,49 +13,44 @@ class HathiFiles :access ] - CC_TO_OF = ::Traject::TranslationMap.new('ht/collection_code_to_original_from') - + CC_TO_OF = ::Traject::TranslationMap.new("ht/collection_code_to_original_from") # Note how for both oclc_nums and bibs we need to map everything to strings, # since the database stores those values as strings. Confusingly, you'll get the # right answer if you send ints becauyse mysql will silently change them, but it # will then refuse to use the indexes! def self.oclc_query(oclc_nums) - - oclc_nums.map!{|num| num.to_i} + oclc_nums.map! { |num| num.to_i } oclc_join = DB[:hf].join(:hf_oclc, htid: :htid) hf_htid = Sequel[:hf][:htid] - oclc_join.select(*SELECTED_COLS). - where(value: Array(oclc_nums).map(&:to_s)) + oclc_join.select(*SELECTED_COLS) + .where(value: Array(oclc_nums).map(&:to_s)) end def self.bib_query(bib_nums) bib_join = DB[:hf].join(:hf_source_bib, htid: :htid) - bib_join.select(*SELECTED_COLS). - where(source: 'MIU'). 
-        where(value: Array(bib_nums).map(&:to_s))
+      bib_join.select(*SELECTED_COLS)
+        .where(source: "MIU")
+        .where(value: Array(bib_nums).map(&:to_s))
     end

     def self.query(bib_nums:, oclc_nums:)
-      self.bib_query(bib_nums).union(self.oclc_query(oclc_nums))
+      bib_query(bib_nums).union(oclc_query(oclc_nums))
     end

-    #DB.logger = Logger.new($stdout)
+    # DB.logger = Logger.new($stdout)

     # I use a db driver per thread to avoid any conflicts
     def self.get_hf_info(oclc_nums, bib_nums)
       oclc_nums = Array(oclc_nums)
       bib_nums = Array(bib_nums)

-      hf_hash = Hash.new
+      hf_hash = {}

-      self.query(bib_nums: bib_nums, oclc_nums: oclc_nums).each do |r|
+      query(bib_nums: bib_nums, oclc_nums: oclc_nums).each do |r|
         hf_hash[r[:id]] = r
-        hf_hash[r[:id]]['source'] = CC_TO_OF[r[:collection_code].downcase]
+        hf_hash[r[:id]]["source"] = CC_TO_OF[r[:collection_code].downcase]
       end
       hf_hash.values
     end
-
   end
 end
-
-
diff --git a/umich_catalog_indexing/lib/ht_traject/ht_overlap.rb b/umich_catalog_indexing/lib/ht_traject/ht_overlap.rb
index ce99fa70..d5b62808 100644
--- a/umich_catalog_indexing/lib/ht_traject/ht_overlap.rb
+++ b/umich_catalog_indexing/lib/ht_traject/ht_overlap.rb
@@ -1,18 +1,16 @@
-require 'traject'
-require_relative 'ht_dbh'
-require 'sequel'
+require "traject"
+require "sequel"

 module HathiTrust
-
   class UmichOverlap
-    DB = HathiTrust::DBH_overlap::DB
+    DB = S.overlap_mysql

     Umich_overlap_query = DB[:overlap].select(:access)

     # I use a db driver per thread to avoid any conflicts
     def self.get_overlap(oclc_nums)
       oclc_nums = Array(oclc_nums)
-      count_all  = 0
-      count_etas = 0
+      count_all = 0
+      count_etas = 0

       if oclc_nums.any?
         Umich_overlap_query.where(oclc: oclc_nums).each do |r|
           count_all += 1
@@ -25,10 +23,6 @@ def self.get_overlap(oclc_nums)
         count_all: count_all,
         count_etas: count_etas
       }
-
     end
   end
-
 end
-
-
diff --git a/umich_catalog_indexing/lib/jobs/index_alma_xml.rb b/umich_catalog_indexing/lib/jobs/index_alma_xml.rb
index c6243eab..1413225f 100644
--- a/umich_catalog_indexing/lib/jobs/index_alma_xml.rb
+++ b/umich_catalog_indexing/lib/jobs/index_alma_xml.rb
@@ -13,7 +13,7 @@ def initialize(file:, solr_url:, logger: Logger.new($stdout),
   end

   def run
-    @logger.info "fetching #{@file} from #{ENV.fetch("ALMA_FILES_HOST")}"
+    @logger.info "fetching #{@file} from #{S.sftp_host}"
     @alma_file_processor.run
     @translation_map_generator.generate_all
diff --git a/umich_catalog_indexing/lib/services.rb b/umich_catalog_indexing/lib/services.rb
index 0115d493..67f6d0bf 100644
--- a/umich_catalog_indexing/lib/services.rb
+++ b/umich_catalog_indexing/lib/services.rb
@@ -4,17 +4,8 @@
 Services = Canister.new
 S = Services

-S.register(:project_root) do
-  File.absolute_path(File.join(__dir__, ".."))
-end
-
-S.register(:log_stream) do
-  $stdout.sync = true
-  $stdout
-end
-
-Services.register(:logger) do
-  SemanticLogger["Catalog Indexing"]
-end
-
-SemanticLogger.add_appender(io: S.log_stream, level: :info) unless ENV["APP_ENV"] == "test"
+require_relative "services/paths"
+require_relative "services/logger"
+require_relative "services/dbs"
+require_relative "services/solr"
+require_relative "services/sftp"
diff --git a/umich_catalog_indexing/lib/services/dbs.rb b/umich_catalog_indexing/lib/services/dbs.rb
new file mode 100644
index 00000000..a3a8802c
--- /dev/null
+++ b/umich_catalog_indexing/lib/services/dbs.rb
@@ -0,0 +1,40 @@
+S.register(:no_db?) { ENV["NODB"] ? true : false }
+
+# Overlap DB
+S.register(:overlap_user) { ENV.fetch("HATHI_OVERLAP_USER", "user") }
+S.register(:overlap_password) { ENV.fetch("HATHI_OVERLAP_PASSWORD", "password") }
+S.register(:overlap_host) { ENV.fetch("HATHI_OVERLAP_HOST", "hathioverlap") }
+S.register(:overlap_db_name) { ENV.fetch("HATHI_OVERLAP_DB", "overlap") }
+
+S.register(:overlap_mysql) do
+  Sequel.connect("mysql2://#{S.overlap_host}/#{S.overlap_db_name}?user=#{S.overlap_user}&password=#{S.overlap_password}&useTimezone=true&serverTimezone=UTC",
+    login_timeout: 2,
+    pool_timeout: 10,
+    max_connections: 6)
+rescue => e
+  warn e
+  warn "************************************************************"
+  warn "Cannot Reach #{S.overlap_host}"
+  warn "If you're on a machine where you can't reach the database,"
+  warn "run with environment NODB=1 to skip all db stuff"
+  warn "************************************************************"
+  exit 1
+end
+
+# Hathifiles DB
+S.register(:hathifiles_user) { ENV.fetch("HATHIFILE_USER", "user") }
+S.register(:hathifiles_password) { ENV.fetch("HATHIFILE_PASSWORD", "password") }
+S.register(:hathifiles_host) { ENV.fetch("HATHIFILE_HOST", "hathifiles") }
+S.register(:hathifiles_db) { ENV.fetch("HATHIFILE_DB", "hathifiles") }
+
+S.register(:hathifiles_mysql) do
+  DB = Sequel.connect("mysql2://#{S.hathifiles_host}/#{S.hathifiles_db}?user=#{S.hathifiles_user}&password=#{S.hathifiles_password}&useTimezone=true&serverTimezone=UTC", login_timeout: 2, pool_timeout: 10, max_connections: 6)
+rescue => e
+  warn e
+  warn "************************************************************"
+  warn "Cannot Reach #{S.hathifiles_host}"
+  warn "If you're on a machine where you can't reach the database,"
+  warn "run with environment NODB=1 to skip all db stuff"
+  warn "************************************************************"
+  exit 1
+end
diff --git a/umich_catalog_indexing/lib/services/logger.rb b/umich_catalog_indexing/lib/services/logger.rb
new file mode 100644
index 00000000..5bba80de
--- /dev/null
+++ b/umich_catalog_indexing/lib/services/logger.rb
@@ -0,0 +1,10 @@
+S.register(:log_stream) do
+  $stdout.sync = true
+  $stdout
+end
+
+Services.register(:logger) do
+  SemanticLogger["Catalog Indexing"]
+end
+
+SemanticLogger.add_appender(io: S.log_stream, level: :info) unless ENV["APP_ENV"] == "test"
diff --git a/umich_catalog_indexing/lib/services/paths.rb b/umich_catalog_indexing/lib/services/paths.rb
new file mode 100644
index 00000000..82f8aa01
--- /dev/null
+++ b/umich_catalog_indexing/lib/services/paths.rb
@@ -0,0 +1,5 @@
+S.register(:project_root) do
+  File.absolute_path(File.join(__dir__, "../../"))
+end
+
+S.register(:scratch_dir) { File.join(S.project_root, "scratch") }
diff --git a/umich_catalog_indexing/lib/services/sftp.rb b/umich_catalog_indexing/lib/services/sftp.rb
new file mode 100644
index 00000000..023b545e
--- /dev/null
+++ b/umich_catalog_indexing/lib/services/sftp.rb
@@ -0,0 +1,10 @@
+require "sftp"
+S.register(:sftp_user) { ENV.fetch("ALMA_FILES_USER", "alma") }
+S.register(:sftp_host) { ENV.fetch("ALMA_FILES_HOST", "sftp") }
+S.register(:ssh_key_path) { ENV.fetch("SSH_KEY_PATH", "/etc/secret-volume/id_rsa") }
+
+SFTP.configure do |config|
+  config.user = S.sftp_user
+  config.host = S.sftp_host
+  config.key_path = S.ssh_key_path
+end
diff --git a/umich_catalog_indexing/lib/services/solr.rb b/umich_catalog_indexing/lib/services/solr.rb
new file mode 100644
index 00000000..de9d7fe4
--- /dev/null
+++ b/umich_catalog_indexing/lib/services/solr.rb
@@ -0,0 +1,7 @@
+S.register(:solrcloud_on?) do
+  (ENV["SOLRCLOUD_ON"] == "true") ? true : false
+end
+S.register(:solr_threads) { ENV.fetch("SOLR_THREADS", 1).to_i }
+S.register(:solr_user) { ENV.fetch("SOLR_USER", "solr") }
+S.register(:solr_password) { ENV.fetch("SOLR_PASSWORD", "SolrRocks") }
+S.register(:processing_threads) { ENV.fetch("PROCESSING_THREADS", 8) }
diff --git a/umich_catalog_indexing/lib/sidekiq_jobs.rb b/umich_catalog_indexing/lib/sidekiq_jobs.rb
index 5c28d18f..15a5bbc6 100644
--- a/umich_catalog_indexing/lib/sidekiq_jobs.rb
+++ b/umich_catalog_indexing/lib/sidekiq_jobs.rb
@@ -1,6 +1,7 @@
 $:.unshift File.dirname(__FILE__).to_s
 require "sidekiq"
 require "jobs"
+require "services"

 class JobQueued
   def call(worker, job, queue, redis_pool)
@@ -42,12 +43,6 @@ def call(worker, job, queue)
   end
 end

-SFTP.configure do |config|
-  config.user = ENV.fetch("ALMA_FILES_USER")
-  config.host = ENV.fetch("ALMA_FILES_HOST")
-  config.key_path = ENV.fetch("SSH_KEY_PATH")
-end
-
 class IndexIt
   include Sidekiq::Worker
   def perform(file, solr_url)
diff --git a/umich_catalog_indexing/overlap/02_empty_and_reload_overlap.sh b/umich_catalog_indexing/overlap/02_empty_and_reload_overlap.sh
index 74c20443..b6757dcb 100644
--- a/umich_catalog_indexing/overlap/02_empty_and_reload_overlap.sh
+++ b/umich_catalog_indexing/overlap/02_empty_and_reload_overlap.sh
@@ -1,5 +1,5 @@
-dbname=bibliosearch
-overlap_file=overlap_umich.tsv
+dbname=overlap
+overlap_file=/overlap.tsv
 tablename=overlap

 password=pass
diff --git a/umich_catalog_indexing/overlap/Dockerfile b/umich_catalog_indexing/overlap/Dockerfile
index 7f686341..08d60e49 100644
--- a/umich_catalog_indexing/overlap/Dockerfile
+++ b/umich_catalog_indexing/overlap/Dockerfile
@@ -1,10 +1,5 @@
 FROM mariadb

-ENV MARIADB_USER bibliosearch
-ENV MARIADB_PASSWORD bibliosearch_password
-ENV MARIADB_DATABASE bibliosearch
-ENV MARIADB_ROOT_PASSWORD pass
-
 COPY 01_overlap_defs.sql /docker-entrypoint-initdb.d/
 COPY 02_empty_and_reload_overlap.sh /docker-entrypoint-initdb.d/
-COPY overlap_umich.tsv /
+COPY overlap.tsv /
diff --git a/umich_catalog_indexing/scratch/.keep b/umich_catalog_indexing/scratch/.keep
new file mode 100644
index 00000000..e69de29b
diff --git a/umich_catalog_indexing/spec/support/traject_settings.rb b/umich_catalog_indexing/spec/support/traject_settings.rb
index f928b288..ef4ca17d 100644
--- a/umich_catalog_indexing/spec/support/traject_settings.rb
+++ b/umich_catalog_indexing/spec/support/traject_settings.rb
@@ -1,32 +1,32 @@
 $:.unshift "#{File.dirname(__FILE__)}/../lib"
-require 'library_stdnums'
+require "services"
+require "library_stdnums"

-require 'traject/macros/marc21'
+require "traject/macros/marc21"
 extend Traject::Macros::Marc21

-require 'traject/macros/marc21_semantics'
+require "traject/macros/marc21_semantics"
 extend Traject::Macros::Marc21Semantics

-require 'traject/macros/marc_format_classifier'
+require "traject/macros/marc_format_classifier"
 extend Traject::Macros::MarcFormats

-require 'ht_traject'
+require "ht_traject"
 extend HathiTrust::Traject::Macros
 extend Traject::UMichFormat::Macros

 require "traject/null_writer"

-require 'marc_record_speed_monkeypatch'
-require 'marc4j_fix'
-
-UmichOverlap = if ENV['NODB']
-                 require "ht_traject/no_db_mocks/ht_overlap"
-                 HathiTrust::NoDB::UmichOverlap
-               else
-                 require 'ht_traject/ht_overlap.rb'
-                 HathiTrust::UmichOverlap
-               end
+require "marc_record_speed_monkeypatch"
+require "marc4j_fix"
+UmichOverlap = if S.no_db?
+  require "ht_traject/no_db_mocks/ht_overlap"
+  HathiTrust::NoDB::UmichOverlap
+else
+  require "ht_traject/ht_overlap"
+  HathiTrust::UmichOverlap
+end

 settings do
   store "writer_class_name", "Traject::NullWriter"
diff --git a/umich_catalog_indexing/writers/solr.rb b/umich_catalog_indexing/writers/solr.rb
index e5db5e1a..4252f606 100644
--- a/umich_catalog_indexing/writers/solr.rb
+++ b/umich_catalog_indexing/writers/solr.rb
@@ -1,16 +1,16 @@
-require 'traject'
-require 'traject/solr_json_writer'
+$:.unshift "#{File.dirname(__FILE__)}/../lib"
+require "traject"
+require "traject/solr_json_writer"
+require "services"

 settings do
-  provide "solr_writer.basic_auth_user", ENV.fetch("SOLR_USER") if ENV.fetch("SOLRCLOUD_ON") == "true"
-  provide "solr_writer.basic_auth_password", ENV.fetch("SOLR_PASSWORD") if ENV.fetch("SOLRCLOUD_ON") == "true"
+  provide "solr_writer.basic_auth_user", S.solr_user if S.solrcloud_on?
+  provide "solr_writer.basic_auth_password", S.solr_password if S.solrcloud_on?
   provide "solr_writer.max_skipped", 1000
   provide "solr_writer.commit_on_close", "true"
-  provide "solr_writer.thread_pool", 2
+  provide "solr_writer.thread_pool", S.solr_threads
   provide "solr_writer.batch_size", 60
   provide "writer_class_name", "Traject::SolrJsonWriter"
-  store "processing_thread_pool", 8
+  store "processing_thread_pool", S.processing_threads
   provide "log.batch_size", 50_000
 end
-
-