Skip to content

Commit

Permalink
moved around the dependent db definitions
Browse files Browse the repository at this point in the history
  • Loading branch information
niquerio committed Mar 19, 2024
1 parent be02f85 commit d3e571d
Show file tree
Hide file tree
Showing 19 changed files with 148 additions and 136 deletions.
19 changes: 9 additions & 10 deletions compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ services:
environment:
- REDIS_URL=redis://redis:6379
- HLB_XML_ENDPOINT=https://apps.lib.umich.edu/browse/categories/xml.php
- NODB=1
env_file:
- ./umich_catalog_indexing/.env
- ./umich_catalog_indexing/env.development
Expand Down Expand Up @@ -100,15 +99,15 @@ services:
env_file:
- ./mock-ht/env.development

#hathifiles:
#image: mariadb
#volumes:
#- hathidb:/var/lib/mysql
#environment:
#- MARIADB_ROOT_PASSWORD=pass
#- MARIADB_USER=user
#- MARIADB_PASSWORD=password
#- MARIADB_DATABASE=hathifiles
hathifiles:
image: mariadb
volumes:
- hathidb:/var/lib/mysql
environment:
- MARIADB_ROOT_PASSWORD=pass
- MARIADB_USER=user
- MARIADB_PASSWORD=password
- MARIADB_DATABASE=hathifiles

hathioverlap:
build: overlap/.
Expand Down
14 changes: 14 additions & 0 deletions umich_catalog_indexing/bin/index_json_sample.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/local/bin/ruby
# Index a gzipped JSON file of MARC records into the development Solr.
# Intended to run inside the indexing container; needs `docker compose up`.
$LOAD_PATH << "/app/lib"
require "sidekiq_jobs"

if ["-h", "--help"].include?(ARGV[0])
  # Fix: usage previously named the wrong script (`index_sample.rb`) and the
  # wrong default basename (`zephir_pd_...`); both now match the code below.
  puts <<~USAGE
    description: indexes a `json.gz` file of marc records; needs `docker compose up`
    usage: bundle exec index_json_sample.rb [file_basename]
    file_basename: sftp/search_daily_bibs/[file_basename].json.gz to be indexed.
    If not provided uses sftp/search_daily_bibs/zephir_upd_20220301.json.gz
  USAGE
  return
end

# Default matches the sample file shipped with the dev environment.
basename = ARGV[0] || "zephir_upd_20220301"
IndexJson.new.perform("search_daily_bibs/#{basename}.json.gz", "http://solr:8983/solr/biblio")
5 changes: 4 additions & 1 deletion umich_catalog_indexing/bin/sftp
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#!/bin/bash
# Open an interactive SFTP session to the Alma files host.
# Connection settings come from the environment; each falls back to the
# local-dev default (the docker-compose `sftp` service) when unset.
# Fix: drop the duplicated, unguarded sftp invocation that relied on the
# raw env vars — only the defaulted invocation below should run.

ssh_key_path=${SSH_KEY_PATH-"/etc/secret-volume/id_rsa"}
sftp_user=${ALMA_FILES_USER-"alma"}
sftp_host=${ALMA_FILES_HOST-"sftp"}
sftp -oIdentityFile=${ssh_key_path} -oStrictHostKeyChecking=no ${sftp_user}@${sftp_host}
12 changes: 1 addition & 11 deletions umich_catalog_indexing/env.development
Original file line number Diff line number Diff line change
@@ -1,15 +1,4 @@
HATHIFILE_HOST=hathidb
HATHIFILE_DB=hathifiles
HATHIFILE_USER=root
HATHIFILE_PASSWORD=pass
HATHI_OVERLAP_HOST=hathioverlap
HATHI_OVERLAP_DB=overlap
HATHI_OVERLAP_USER=root
HATHI_OVERLAP_PASSWORD=pass
ALMA_API_HOST=https://api-na.hosted.exlibrisgroup.com
ALMA_FILES_USER=alma
ALMA_FILES_HOST=sftp
SSH_KEY_PATH=/etc/secret-volume/id_rsa
DAILY_ALMA_FILES_PATH="search_daily_bibs"
MACC_PRODUCTION_SOLR_URL=http://solr:8983/solr/biblio
HATCHER_PRODUCTION_SOLR_URL=http://solr:8983/solr/biblio
Expand All @@ -20,3 +9,4 @@ SOLR_USER=solr
SOLR_PASSWORD=SolrRocks
SOLRCLOUD_ON=true
PROMETHEUS_PUSH_GATEWAY=http://pushgateway:9091
HLB_XML_ENDPOINT="https://apps.lib.umich.edu/browse/categories/xml.php"
1 change: 1 addition & 0 deletions umich_catalog_indexing/env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
ALMA_API_KEY='YOUR_ALMA_API_KEY'
NODB=1
#REINDEX=1
SUPERVISOR_ON='true'
7 changes: 4 additions & 3 deletions umich_catalog_indexing/indexers/common.rb
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,9 @@

to_field "record_source", record_source # set to alma or zephir, based on record id

talk_to_overlap = []
# for zephir records, check umich print holdings overlap file--skip if oclc number is found in file
# for zephir records, check umich print holdings overlap file--skip if oclc number is found in file; Only do this for the dailies.
#
talk_to_overlap = Concurrent::Array.new
each_record do |rec, context|
# context.clipboard[:ht][:overlap] = UmichOverlap.get_overlap(oclc_nums) # returns count of all records found (:count_all), and access=deny records (:count_etas)
if context.clipboard[:ht][:record_source] == "zephir"
Expand All @@ -59,7 +60,7 @@
end
if talk_to_overlap.size % 1000 == 0
avg = (talk_to_overlap.last(1000).sum(0.0) / 1000) * 1000
S.logger.info "avg time talking to overlap: #{avg}"
S.logger.info "avg time talking to overlap: #{avg}" if avg > 0
end
end

Expand Down
29 changes: 12 additions & 17 deletions umich_catalog_indexing/indexers/settings.rb
Original file line number Diff line number Diff line change
@@ -1,36 +1,32 @@
$:.unshift "#{File.dirname(__FILE__)}/../lib"
require 'set'
require "set"
require "services"

require 'library_stdnums'
require "library_stdnums"

require 'traject/macros/marc21_semantics'
require "traject/macros/marc21_semantics"
extend Traject::Macros::Marc21Semantics

require 'traject/macros/marc_format_classifier'
require "traject/macros/marc_format_classifier"
extend Traject::Macros::MarcFormats

require 'ht_traject'
require "ht_traject"
extend HathiTrust::Traject::Macros
extend Traject::UMichFormat::Macros

require 'marc/fastxmlwriter'
require "marc/fastxmlwriter"

require 'marc_record_speed_monkeypatch'
require 'marc4j_fix'
require "marc_record_speed_monkeypatch"
require "marc4j_fix"

UmichOverlap = if ENV['NODB']
require "ht_traject/no_db_mocks/ht_overlap"
HathiTrust::NoDB::UmichOverlap
else
require 'ht_traject/ht_overlap.rb'
HathiTrust::UmichOverlap
end
UmichOverlap = S.overlap_klass
S.logger.info "Using #{UmichOverlap}"
S.logger.info "Using #{S.hathifiles_klass}"

settings do
store "log.batch_progress", 10_000
end


logger.info RUBY_DESCRIPTION

################################
Expand All @@ -39,4 +35,3 @@

# Set up an area in the clipboard for use storing intermediate stuff
each_record HathiTrust::Traject::Macros.setup

19 changes: 5 additions & 14 deletions umich_catalog_indexing/indexers/umich_alma.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,7 @@
# require 'ht_traject/ht_overlap.rb'
require "json"
require "umich_traject/floor_location"

HathiFiles = if ENV["NODB"]
require "ht_traject/no_db_mocks/ht_hathifiles"
HathiTrust::NoDB::HathiFiles
else
require "ht_traject/ht_hathifiles"
HathiTrust::HathiFiles
end
HathiFiles = S.hathifiles_klass
libLocInfo = Traject::TranslationMap.new("umich/libLocInfo")
electronic_collections = Traject::TranslationMap.new("umich/electronic_collections")

Expand Down Expand Up @@ -41,13 +34,11 @@
end

cc_to_of = Traject::TranslationMap.new("ht/collection_code_to_original_from")
talk_to_hathi = []
talk_to_hathi = Concurrent::Array.new
each_record do |r, context|
locations = []
inst_codes = []
availability = []
sh = {}
has_e56 = false
id = context.output_hash["id"]

# "OWN" field
Expand Down Expand Up @@ -134,9 +125,9 @@
context.clipboard[:ht][:availability] = availability.compact.uniq.sort
context.clipboard[:ht][:locations] = locations.compact.uniq.sort
context.clipboard[:ht][:inst_codes] = inst_codes.compact.uniq.sort
if talk_to_hathi.size % 1000 == 0
avg = (talk_to_hathi.last(1000).sum(0.0) / 1000) * 1000
# S.logger.info "avg time talking to hathifiles: #{avg}"
if talk_to_hathi.size % 100 == 0
avg = (talk_to_hathi.last(100).sum(0.0) / 100) * 1000
S.logger.info "avg time talking to hathifiles: #{avg}" if avg > 0
end
end

Expand Down
7 changes: 7 additions & 0 deletions umich_catalog_indexing/lib/config/sftp_config.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
require "sftp"
require_relative "../services"

# Wire the SFTP client up to the connection settings exposed by the
# Services object (S) so every job shares one configuration source.
SFTP.configure do |c|
  c.user = S.sftp_user
  c.host = S.sftp_host
  c.key_path = S.ssh_key_path
end
11 changes: 5 additions & 6 deletions umich_catalog_indexing/lib/ht_traject.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
require 'forwardable'
require 'ht_traject/ht_macros'
require 'ht_traject/ht_item'
require 'traject/umich_format'
require 'ht_traject/fast_xmlwriter'

# Umbrella require for the HathiTrust Traject pieces: macros, item
# handling, UMich format detection, and the fast XML writer.
require "forwardable"
require "ht_traject/ht_macros"
require "ht_traject/ht_item"
require "traject/umich_format"
require "ht_traject/fast_xmlwriter"
37 changes: 0 additions & 37 deletions umich_catalog_indexing/lib/ht_traject/ht_dbh.rb

This file was deleted.

38 changes: 16 additions & 22 deletions umich_catalog_indexing/lib/ht_traject/ht_hathifiles.rb
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
require 'traject'
require_relative 'ht_dbh'
require 'sequel'
require "traject"
require "sequel"
require_relative "../services"

module HathiTrust

class HathiFiles
DB = HathiTrust::DBH::DB
DB = S.hathifiles_mysql

SELECTED_COLS = [
Sequel[:hf][:htid].as(:id),
Expand All @@ -15,49 +14,44 @@ class HathiFiles
:access
]

CC_TO_OF = ::Traject::TranslationMap.new('ht/collection_code_to_original_from')

CC_TO_OF = ::Traject::TranslationMap.new("ht/collection_code_to_original_from")

# Note how for both oclc_nums and bibs we need to map everything to strings,
# since the database stores those values as strings. Confusingly, you'll get the
right answer if you send ints because mysql will silently change them, but it
# will then refuse to use the indexes!
def self.oclc_query(oclc_nums)

oclc_nums.map!{|num| num.to_i}
oclc_nums.map! { |num| num.to_i }
oclc_join = DB[:hf].join(:hf_oclc, htid: :htid)
hf_htid = Sequel[:hf][:htid]
oclc_join.select(*SELECTED_COLS).
where(value: Array(oclc_nums).map(&:to_s))
oclc_join.select(*SELECTED_COLS)
.where(value: Array(oclc_nums).map(&:to_s))
end

def self.bib_query(bib_nums)
bib_join = DB[:hf].join(:hf_source_bib, htid: :htid)
bib_join.select(*SELECTED_COLS).
where(source: 'MIU').
where(value: Array(bib_nums).map(&:to_s))
bib_join.select(*SELECTED_COLS)
.where(source: "MIU")
.where(value: Array(bib_nums).map(&:to_s))
end

def self.query(bib_nums:, oclc_nums:)
self.bib_query(bib_nums).union(self.oclc_query(oclc_nums))
bib_query(bib_nums).union(oclc_query(oclc_nums))
end

#DB.logger = Logger.new($stdout)
# DB.logger = Logger.new($stdout)
# I use a db driver per thread to avoid any conflicts
def self.get_hf_info(oclc_nums, bib_nums)
oclc_nums = Array(oclc_nums)
bib_nums = Array(bib_nums)
hf_hash = Hash.new
hf_hash = {}

self.query(bib_nums: bib_nums, oclc_nums: oclc_nums).each do |r|
query(bib_nums: bib_nums, oclc_nums: oclc_nums).each do |r|
hf_hash[r[:id]] = r
hf_hash[r[:id]]['source'] = CC_TO_OF[r[:collection_code].downcase]
hf_hash[r[:id]]["source"] = CC_TO_OF[r[:collection_code].downcase]
end

hf_hash.values
end

end
end


4 changes: 2 additions & 2 deletions umich_catalog_indexing/lib/ht_traject/ht_overlap.rb
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
require "traject"
require_relative "ht_dbh"
require "sequel"
require_relative "../services"

module HathiTrust
class UmichOverlap
DB = HathiTrust::DBH_overlap::DB
DB = S.overlap_mysql
Umich_overlap_query = DB[:overlap].select(:access)

# I use a db driver per thread to avoid any conflicts
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,12 @@
module HathiTrust
module NoDB
class UmichOverlap

# No-database stub: reports zero print-holdings overlap for any set of
# OCLC numbers, so indexing can run without the overlap MySQL service.
def self.get_overlap(oclc_nums)
  {count_all: 0, count_etas: 0}
end

end
end
end
end
2 changes: 1 addition & 1 deletion umich_catalog_indexing/lib/jobs/index_alma_xml.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def initialize(file:, solr_url:, logger: Logger.new($stdout),
end

def run
@logger.info "fetching #{@file} from #{ENV.fetch("ALMA_FILES_HOST")}"
@logger.info "fetching #{@file} from #{S.sftp_host}"
@alma_file_processor.run

@translation_map_generator.generate_all
Expand Down
2 changes: 1 addition & 1 deletion umich_catalog_indexing/lib/jobs/index_json.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def initialize(file:, solr_url:, logger: S.logger,
end

def run
@logger.info "fetching #{@file} from #{ENV.fetch("ALMA_FILES_HOST")}"
@logger.info "fetching #{@file} from #{S.sftp_host}"
@file_processor.run

@translation_map_generator.generate_all
Expand Down
Loading

0 comments on commit d3e571d

Please sign in to comment.