Skip to content

Commit

Permalink
replace half diacritics U+FE20, U+FE21 with full diacritic U+0361
Browse files Browse the repository at this point in the history
  • Loading branch information
dnoneill committed Mar 1, 2024
1 parent f9d186c commit 930900e
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 1 deletion.
8 changes: 7 additions & 1 deletion lib/traject/readers/folio_postgres_reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ def each
break if response.nil?

response.each do |row|
data = JSON.parse(row['jsonb_build_object'])
jsonb_build_object = Traject::FolioPostgresReader.encoding_cleanup(row['jsonb_build_object'])
data = JSON.parse(jsonb_build_object)

merge_separately_queried_data!(data)

Expand All @@ -73,6 +74,11 @@ def each
end
end

# Collapses paired combining ligature halves into a single double-breve mark.
#
# Some text (e.g. romanized Cyrillic titles) ties two letters together with a
# left half mark (U+FE20) and a right half mark (U+FE21), as in
# "i" + U+FE20 + "a" + U+FE21. Each left/right pair separated by one or two
# characters is rewritten as U+0361 (combining double inverted breve) followed
# by the separated characters; the right half mark is dropped.
#
# Half marks that are unpaired, or more than two characters apart, are left
# untouched, as is any text that contains no half marks at all.
#
# @param row [String] text to clean (the reader passes a row's raw JSON text)
# @return [String] a new string; +row+ itself is not modified
def self.encoding_cleanup(row)
  # Match the half marks literally. The previous bracketed form
  # ([?=U+FE20] ... [?<=U+FE21]) was a pair of character classes, not
  # lookarounds, so it also matched plain "?", "=" and "<" and corrupted
  # ordinary text such as URL query strings ("?a=1").
  row.gsub(/\ufe20(.{1,2})\ufe21/, "\u0361\\1")
end

# Asks the database behind @connection for its current time.
#
# Runs `SELECT NOW()` and parses the value at row 0, column 0 of the result.
#
# @return [Time] the parsed result of the query
def sql_server_current_time
  now_result = @connection.exec('SELECT NOW()')
  raw_timestamp = now_result.getvalue(0, 0)
  Time.parse(raw_timestamp)
end
Expand Down
23 changes: 23 additions & 0 deletions spec/lib/traject/readers/folio_postgres_reader_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,27 @@
expect(result).to be_a FolioRecord
end
end

context 'check encoding parsing' do
  # Serializes three sample titles to JSON, passes the JSON text through
  # encoding_cleanup, and parses the cleaned text back into a Hash.
  let(:encoded_string) do
    sample_titles = {
      # Properly paired half marks (U+FE20 ... U+FE21) around a single letter.
      "title1" => "Strategii︠a︡ planirovanii︠a︡ izbiratelʹnoĭ kampanii",
      # No half marks at all.
      "title2" => "Unencoded string",
      # Half marks present but far apart, so they do not form a pair.
      "title3" => "Strategiii︠a planirovaniia︡ izbiratelʹnoĭ kampanii"
    }
    raw_json = JSON.generate(sample_titles)
    cleaned_json = described_class.encoding_cleanup(raw_json)
    JSON.parse(cleaned_json)
  end

  it 'encodes cyrilic correctly' do
    cleaned_title = encoded_string['title1']
    expect(cleaned_title).to eq('Strategii͡a planirovanii͡a izbiratelʹnoĭ kampanii')
  end

  it 'returns unencoded string without change' do
    cleaned_title = encoded_string['title2']
    expect(cleaned_title).to eq('Unencoded string')
  end

  it 'returns encoded string without change' do
    cleaned_title = encoded_string['title3']
    expect(cleaned_title).to eq('Strategiii︠a planirovaniia︡ izbiratelʹnoĭ kampanii')
  end
end
end

0 comments on commit 930900e

Please sign in to comment.