From a1af7586ba0e31479138d496ad7a7a6de1a7a5fc Mon Sep 17 00:00:00 2001
From: dnoneill
Date: Fri, 1 Mar 2024 10:53:42 -0500
Subject: [PATCH] replace half diacritics U+FE20, U+FE21 with full diacritic U+0361

---
Review notes: encoding_cleanup now matches only the literal U+FE20/U+FE21
marks; the earlier bracketed pattern also matched plain "?", "=" and "<".
Leading whitespace of the diff lines below was reconstructed and the index
hashes predate these edits -- confirm against the target files before applying.

 lib/traject/readers/folio_postgres_reader.rb  | 15 ++++++++++-
 .../readers/folio_postgres_reader_spec.rb     | 26 +++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/lib/traject/readers/folio_postgres_reader.rb b/lib/traject/readers/folio_postgres_reader.rb
index ec2f9579b..50b6c6ebe 100644
--- a/lib/traject/readers/folio_postgres_reader.rb
+++ b/lib/traject/readers/folio_postgres_reader.rb
@@ -59,7 +59,8 @@ def each
           break if response.nil?
 
           response.each do |row|
-            data = JSON.parse(row['jsonb_build_object'])
+            jsonb_build_object = Traject::FolioPostgresReader.encoding_cleanup(row['jsonb_build_object'])
+            data = JSON.parse(jsonb_build_object)
 
             merge_separately_queried_data!(data)
 
@@ -73,6 +74,18 @@ def each
       end
     end
 
+    # Replaces each pair of combining ligature halves (U+FE20 ... U+FE21)
+    # that wraps one or two characters with a single combining double
+    # inverted breve (U+0361), e.g. in transliterated Cyrillic i︠a︡ becomes i͡a.
+    #
+    # @param row [String] raw JSON text to clean up
+    # @return [String] a copy of row with the half marks replaced
+    def self.encoding_cleanup(row)
+      # Match the literal half marks only; a bracketed "[?=\ufe20]" would be a
+      # character class that also matches plain "?" and "=" (e.g. in URLs).
+      row.gsub(/\ufe20(.{1,2})\ufe21/, "\u0361\\1")
+    end
+
     def sql_server_current_time
       Time.parse(@connection.exec('SELECT NOW()').getvalue(0, 0))
     end
diff --git a/spec/lib/traject/readers/folio_postgres_reader_spec.rb b/spec/lib/traject/readers/folio_postgres_reader_spec.rb
index 2d9bbd090..227f8b8dd 100644
--- a/spec/lib/traject/readers/folio_postgres_reader_spec.rb
+++ b/spec/lib/traject/readers/folio_postgres_reader_spec.rb
@@ -71,4 +71,30 @@
       expect(result).to be_a FolioRecord
     end
   end
+
+  context 'check encoding parsing' do
+    encoding_sample_str_json = JSON.generate({ 'title1' => 'Strategii︠a︡ planirovanii︠a︡ izbiratelʹnoĭ kampanii',
+                                               'title2' => 'Unencoded string',
+                                               'title3' => 'Strategiii︠a planirovaniia︡ izbiratelʹnoĭ kampanii',
+                                               'title4' => 'https://example.com/?a=b' })
+    let(:encoded_string) do
+      JSON.parse(described_class.encoding_cleanup(encoding_sample_str_json))
+    end
+
+    it 'encodes cyrilic correctly' do
+      expect(encoded_string['title1']).to eq('Strategii͡a planirovanii͡a izbiratelʹnoĭ kampanii')
+    end
+
+    it 'returns unencoded string without change' do
+      expect(encoded_string['title2']).to eq('Unencoded string')
+    end
+
+    it 'returns encoded string without change' do
+      expect(encoded_string['title3']).to eq('Strategiii︠a planirovaniia︡ izbiratelʹnoĭ kampanii')
+    end
+
+    it 'leaves ? and = characters untouched' do
+      expect(encoded_string['title4']).to eq('https://example.com/?a=b')
+    end
+  end
 end