Skip to content

Commit

Permalink
Merge pull request #2499 from herwinw/windows1254
Browse files Browse the repository at this point in the history
Add Windows-1254 encoding
  • Loading branch information
herwinw committed Jan 13, 2025
2 parents 0d9d013 + 899222d commit 08a5ae2
Show file tree
Hide file tree
Showing 8 changed files with 111 additions and 6 deletions.
1 change: 1 addition & 0 deletions include/natalie.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
#include "natalie/encoding/windows1251_encoding_object.hpp"
#include "natalie/encoding/windows1252_encoding_object.hpp"
#include "natalie/encoding/windows1253_encoding_object.hpp"
#include "natalie/encoding/windows1254_encoding_object.hpp"
#include "natalie/encoding/windows1258_encoding_object.hpp"

#include "natalie/encoding_object.hpp"
Expand Down
18 changes: 18 additions & 0 deletions include/natalie/encoding/windows1254_encoding_object.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma once

#include <assert.h>
#include <initializer_list>

#include "natalie/encoding/single_byte_encoding_object.hpp"
#include "natalie/string_object.hpp"

namespace Natalie {

using namespace TM;

// Encoding object for Windows-1254, a single-byte encoding.
// The class declares nothing of its own beyond a default constructor;
// everything else is inherited from SingleByteEncodingObject.
class Windows1254EncodingObject : public SingleByteEncodingObject {
public:
// Default constructor. Only declared here, so the definition has to be
// provided by a separate translation unit.
Windows1254EncodingObject();
};

}
5 changes: 3 additions & 2 deletions include/natalie/encodings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

namespace Natalie {

const size_t EncodingCount = 46;
const size_t EncodingCount = 47;

enum class Encoding {
NONE = 0,
Expand Down Expand Up @@ -53,7 +53,8 @@ enum class Encoding {
Windows_1251 = 43,
Windows_1252 = 44,
Windows_1253 = 45,
Windows_1258 = 46,
Windows_1254 = 46,
Windows_1258 = 47,
};

}
4 changes: 1 addition & 3 deletions spec/core/string/valid_encoding_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,7 @@
NATFIXME 'Implement Windows-1255 encoding', exception: ArgumentError do
str.force_encoding('Windows-1255').valid_encoding?.should be_true
end
NATFIXME 'Implement Windows-1254 encoding', exception: ArgumentError do
str.force_encoding('Windows-1254').valid_encoding?.should be_true
end
str.force_encoding('Windows-1254').valid_encoding?.should be_true
NATFIXME 'Implement TIS-620 encoding', exception: ArgumentError do
str.force_encoding('TIS-620').valid_encoding?.should be_true
end
Expand Down
24 changes: 24 additions & 0 deletions src/encoding/windows1254_encoding_object.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#include "natalie.hpp"

namespace Natalie {

// Unicode code points for the upper half (bytes 0x80-0xFF) of Windows-1254.
// The table is laid out one row per high nibble, so the entry for a byte is
// found at index [byte - 0x80]: pick the row by the label, then count across.
// -1 marks a byte that has no Unicode equivalent in this code page.
static const long WINDOWS1254[] = {
    /* 0x80 */ 0x20AC, -1, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    /* 0x88 */ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, -1, -1, -1,
    /* 0x90 */ -1, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    /* 0x98 */ 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, -1, -1, 0x0178,
    /* 0xA0 */ 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
    /* 0xA8 */ 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
    /* 0xB0 */ 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
    /* 0xB8 */ 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
    /* 0xC0 */ 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
    /* 0xC8 */ 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
    /* 0xD0 */ 0x011E, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
    /* 0xD8 */ 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0130, 0x015E, 0x00DF,
    /* 0xE0 */ 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
    /* 0xE8 */ 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
    /* 0xF0 */ 0x011F, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
    /* 0xF8 */ 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0131, 0x015F, 0x00FF
};

// Hands the base class this encoding's enum value, its two names
// ("Windows-1254" and "CP1254"), and the WINDOWS1254 table. The body is
// empty: no further setup happens in this class.
Windows1254EncodingObject::Windows1254EncodingObject()
    : SingleByteEncodingObject {
        Encoding::Windows_1254,
        { "Windows-1254", "CP1254" },
        WINDOWS1254
    } {
}

}
3 changes: 3 additions & 0 deletions src/natalie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,9 @@ Env *build_top_env() {
Value EncodingWindows1253 = new Windows1253EncodingObject {};
Encoding->const_set("Windows_1253"_s, EncodingWindows1253);
Encoding->const_set("CP1253"_s, EncodingWindows1253);
Value EncodingWindows1254 = new Windows1254EncodingObject {};
Encoding->const_set("Windows_1254"_s, EncodingWindows1254);
Encoding->const_set("CP1254"_s, EncodingWindows1254);
Value EncodingWindows1258 = new Windows1258EncodingObject {};
Encoding->const_set("Windows_1258"_s, EncodingWindows1258);
Encoding->const_set("CP1258"_s, EncodingWindows1258);
Expand Down
2 changes: 1 addition & 1 deletion src/regexp_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ static const auto ruby_encoding_lookup = []() {
map.put(ONIG_ENCODING_WINDOWS_1251, Encoding::Windows_1251);
map.put(ONIG_ENCODING_WINDOWS_1252, Encoding::Windows_1252);
map.put(ONIG_ENCODING_WINDOWS_1253, Encoding::Windows_1253);
// ONIG_ENCODING_WINDOWS_1254 has no local encoding
map.put(ONIG_ENCODING_WINDOWS_1254, Encoding::Windows_1254);
// ONIG_ENCODING_WINDOWS_1257 has no local encoding
// ONIG_ENCODING_BIG5 has no local encoding
// ONIG_ENCODING_GB18030 has no local encoding
Expand Down
60 changes: 60 additions & 0 deletions test/natalie/encoding_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1067,4 +1067,64 @@
end
end
end

describe 'Windows-1254' do
  it 'can convert codepoints' do
    # Round-trips a byte through chr/ord without leaving Windows-1254.
    # 0x8D is included even though it has no UTF-8 conversion.
    [0x61, 0x8D, 0xFF].each do |byte|
      char = byte.chr(Encoding::Windows_1254)
      char.ord.should == byte
    end
  end

  it 'can convert to UTF-8' do
    # Windows-1254 byte => expected Unicode codepoint.
    { 0x61 => 0x61, 0xD0 => 0x11E, 0xFF => 0xFF }.each do |byte, unicode|
      converted = byte.chr(Encoding::Windows_1254).encode(Encoding::UTF_8)
      # Compared as hex strings so a failure message shows hex values.
      converted.ord.to_s(16).should == unicode.to_s(16)
    end
  end

  it 'cannot convert certain codepoints to UTF-8' do
    # Bytes for which the conversion to UTF-8 must raise.
    [0x81, 0x8D, 0x8E, 0x8F, 0x90, 0x9D, 0x9E].each do |byte|
      conversion = -> { byte.chr(Encoding::Windows_1254).encode(Encoding::UTF_8) }
      conversion.should raise_error(Encoding::UndefinedConversionError, /from Windows-1254 to UTF-8/)
    end
  end

  it 'can convert from UTF-8' do
    # Unicode codepoint => expected Windows-1254 byte.
    { 0x61 => 0x61, 0x11E => 0xD0, 0xFF => 0xFF }.each do |unicode, byte|
      converted = unicode.chr(Encoding::UTF_8).encode(Encoding::Windows_1254)
      converted.ord.to_s(16).should == byte.to_s(16)
    end
  end

  it 'can chop a character (this uses EncodingObject::prev_char)' do
    [0x61, 0x8C, 0xFF].each do |byte|
      prefix = 'a'.encode(Encoding::Windows_1254)
      subject = prefix + byte.chr(Encoding::Windows_1254)
      subject.encoding.should == Encoding::Windows_1254

      # Dropping the trailing character must leave just the prefix and must
      # not change the string's encoding.
      subject.chop!
      subject.encoding.should == Encoding::Windows_1254
      subject.bytes.should == prefix.bytes
    end
  end
end
end

0 comments on commit 08a5ae2

Please sign in to comment.