-
Notifications
You must be signed in to change notification settings - Fork 4
/
string19.rb
executable file
·1944 lines (1532 loc) · 58.9 KB
/
string19.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env ruby19
# encoding: UTF-8
# This document is Copyright (C) Brian Candler 2009 and released under a
# Creative Commons Attribution-NonCommercial 3.0 Unported License.
############# CONTENTS ###################
# -1. PREAMBLE
# 0. INTRODUCTION
# 1. ENCODINGS
# 2. PROPERTIES OF ENCODINGS
# 3. STRING, FILE AND REGEXP PROPERTIES
# 4. VALID ENCODINGS
# 5. COMPATIBLE OBJECTS
# 6. STRING CONCATENATION
# 7. THE BINARY / ASCII-8BIT ENCODING
# 8. SINGLE CHARACTERS
# 9. EQUALITY AND COLLATION
# 10. HASH AND EQL?
# 11. UPPER AND LOWER CASE
# 12. REGULAR EXPRESSIONS
# 13. FROZEN STRINGS
# 14. OTHER METHODS ON STRING
# 15. OTHER METHODS WHICH TAKE STRING ARGUMENTS
# 16. OTHER METHODS WHICH RETURN STRINGS
# 17. LIBRARY METHODS
# 18. SOURCE ENCODING
# 19. STRING LITERALS
# 20. EXTERNAL AND INTERNAL ENCODING
############# -1. PREAMBLE ###############
# This file is runnable documentation. It runs for me under Ubuntu Lucid
# using ruby-1.9.2-p0 and ruby-1.9.1-p429 compiled from source, with my
# default en_GB.utf8 locale. I believe it will run on other systems, but it
# may not, given that Ruby's String behaviour is sensitive to the
# environment in which it is run.
# The following code is just for setting up the test cases.
require 'rbconfig'
# Install name of the interpreter running this file; it is interpolated into
# the %x{...} commands below to start child interpreters under other locales.
# RbConfig is used instead of the legacy top-level Config alias, which was
# deprecated and later removed from Ruby.
RUBY = RbConfig::CONFIG['ruby_install_name']
# Scratch file that the file-writing examples create and then delete.
TMPFILE = "test-ruby19"
require 'test/unit'
class TestString < Test::Unit::TestCase
alias :is :assert_equal
def test_string19
############# 0. INTRODUCTION ####################
# This article attempts to define, in a reasonable level of detail, the M17N
# properties in ruby 1.9 and how they affect the execution of a program.
# With ruby 1.8, if you weren't working with M17N then you could ignore it.
# But with ruby 1.9, even if you are working with binary data like JPEGs or
# PDFs or ASN1 certificates, you need to understand how String has been
# changed, otherwise your program may fail depending on exactly how and
# where it is run.
# There is other documentation on this subject, most notably James Edward
# Gray II's series starting at
# http://blog.grayproductions.net/articles/ruby_19s_string
# but everything I read only raised more questions, so I wrote this
# document as I experimented with the behaviour.
# What I have written here has been determined by reverse-engineering, that
# is, trial-and-error testing and looking at the ruby 1.9 C source. This is
# because there is no official documentation of the expected behaviour of
# String in the 1.9 world. I believe that what I have written is true, and
# I've tried to provide runnable examples of each aspect of the behaviour,
# but I also know that it is far from complete.
# Other documents of note:
# http://yokolet.blogspot.com/2009/07/design-and-implementation-of-ruby-m17n.html
# http://ruby.runpaint.org/encoding
############# 1. ENCODINGS #######################
# An "encoding" is a character set combined with a way of representing each
# of those characters as sequences of bytes. Ruby 1.9 comes with a large
# set of predefined encodings with an instance of the Encoding class
# representing each one. They are all constants under the Encoding
# namespace, and you can get a list of them all using Encoding.list
is Encoding,
Encoding::UTF_8.class
# Each Encoding also has a string name. You can get a list of all names
# using Encoding.name_list, and convert between string names and Encoding
# objects using Encoding.find and Encoding#to_s
is Encoding::ASCII_8BIT,
Encoding.find("ASCII-8BIT")
is "ASCII-8BIT",
Encoding::ASCII_8BIT.to_s
# An encoding may have more than one name, in which case only the primary
# one is returned by Encoding#to_s, but any of the names can be used in
# Encoding.find. All the names are returned by Encoding#names, with the
# primary one first. There is also an Encoding.aliases hash.
is Encoding::ASCII_8BIT,
Encoding.find("BINARY")
is "ASCII-8BIT",
Encoding::BINARY.to_s
# NOTE: 1.9.2 gives me ["ASCII-8BIT", "BINARY"]
# but 1.9.1 gives me ["ASCII-8BIT", "BINARY", "filesystem"]
assert Encoding::ASCII_8BIT.names.include? "ASCII-8BIT"
assert Encoding::ASCII_8BIT.names.include? "BINARY"
is "ASCII-8BIT",
Encoding.aliases["BINARY"]
is nil,
Encoding.aliases["ASCII-8BIT"]
# In general, all the methods which take an Encoding will also take
# a String which is the encoding name, meaning that you don't have to call
# Encoding.find yourself.
# The set of aliases for each encoding is not static, since there are
# special aliases like "external" and "locale" which are reassigned
# dynamically. e.g. in irb
# >> Encoding::UTF_8.names
# => ["UTF-8", "CP65001", "locale", "external"]
# >> Encoding::ISO_8859_1.names
# => ["ISO-8859-1", "ISO8859-1"]
# >> Encoding.default_external = "ISO-8859-1"
# => "ISO-8859-1"
# >> Encoding::UTF_8.names
# => ["UTF-8", "CP65001", "locale"]
# >> Encoding::ISO_8859_1.names
# => ["ISO-8859-1", "ISO8859-1", "external"]
# We will come back to this later.
# Encoding#names and Encoding.aliases are not frozen, so you can modify
# them to add additional aliases if you wish.
# The name of the character set for the system's "locale" is available in
# Encoding.locale_charmap. Note that this is a String, not an Encoding.
is String,
Encoding.locale_charmap.class # e.g. "UTF-8"
# The choice of locale is made at runtime based on environment variables,
# see setlocale(3)
res = %x{env LC_ALL=en_US.utf8 #{RUBY} -e "puts Encoding.locale_charmap"}.chomp
is "UTF-8", res
# Quoting from the Encoding.locale_charmap documentation: "The result is
# highly platform dependent. So Encoding.find(Encoding.locale_charmap) may
# cause an error. If you need some encoding object even for unknown locale,
# Encoding.find("locale") can be used."
# On Linux systems, the 'C' locale maps to the ANSI_X3.4-1968 character set.
res = %x{env LC_ALL=C #{RUBY} -e "puts Encoding.locale_charmap"}.chomp
if res != "ANSI_X3.4-1968"
STDERR.puts "WARNING: got #{res.inspect} as locale_charmap for LC_ALL=C"
end
# On systems which have no locale support at all, the fallback is US-ASCII.
# This may be the case with Cygwin, or at least was in the past.
############# 2. PROPERTIES OF ENCODINGS ##########
# If an encoding includes single-byte ASCII characters in the range 00-7F,
# then it is said to be ASCII-compatible.
#
# Most encodings are in fact ASCII-compatible. An example of one which is not
# is Encoding::UTF_16BE (because all characters are two bytes)
#
# In ruby 1.9.2, this property is directly exposed:
if RUBY_VERSION >= "1.9.2"
is true,
Encoding::UTF_8.ascii_compatible?
end
# In 1.9.1 and earlier you had to test for it indirectly, e.g.
#
# Encoding.list.find_all { |e|
# !Encoding.compatible?("a".force_encoding(e), "a")
# }
#
# There are also encodings which are tagged as 'dummy', which is a property
# you can test for using Encoding#dummy?
is true,
Encoding::ISO_2022_JP.dummy?
# According to a comment in the source, "A dummy encoding is an encoding for
# which character handling is not properly implemented. It is used for
# stateful encodings." (That is, encodings with 'shift' sequences which
# means that the interpretation of a character depends on characters which
# have preceded it)
# REFERENCE: macro rb_enc_asciicompat() in include/ruby/encoding.h
############# 3. STRING, FILE AND REGEXP PROPERTIES #################
# Strings, Regexps and File/IO objects have encoding properties. Two new
# properties have been introduced:
#
# 'encoding': this points to an Encoding object, and labels a string as
# being built from a particular character set. This property can be set
# explicitly, and can also change automatically when you append characters
# to it.
#
# 'ascii_only?': this is a boolean property of the *content* of a String,
# and is set dynamically to represent whether every byte in the string has
# its top bit set to zero (i.e. all values are in the range 00-7F).
# Appending or removing characters from a string can change this property.
# It is false if the Encoding is not ASCII-compatible.
# 3.1 Strings
#
# Strings have both 'encoding' and 'ascii_only?' properties. I will go into
# detail later about how a string picks up its initial encoding, but for now
# notice that string literals in this file get UTF-8 because of the line
# "# encoding: UTF-8" at the top of this file.
str = "hello"
is Encoding::UTF_8,
str.encoding
is true,
str.ascii_only?
str = "groß"
is Encoding::UTF_8,
str.encoding
is false,
str.ascii_only?
# For ASCII-compatible encodings, the empty string has ascii_only? true
str = ""
is true,
str.ascii_only?
# However for non-ASCII-compatible encodings, e.g. "wide" encodings where
# all characters are 2 or more bytes, ascii_only? is always false - even
# for the empty string, or for strings which consist only of characters in
# the 0000-007F range.
str = "".force_encoding("UTF-16BE")
is false,
str.ascii_only?
str = "A".encode!("UTF-16BE")
is false,
str.ascii_only?
# In ruby 1.9, the ascii_only? property is cached to avoid having to
# recompute it all the time. This means that the interpreter must be very
# careful to clear this cache after any string change which might invalidate
# it, otherwise bad things happen (like String#hash or String#eql? giving
# the wrong results)
# The encoding of a string can be changed to any known encoding using
# 'force_encoding'. This does not change the content of the string at all,
# just its encoding tag. It always returns the same string, not a copy.
str = "groß"
is [103, 114, 111, 195, 159],
str.bytes.to_a
str.force_encoding("ISO8859-1")
is Encoding::ISO8859_1,
str.encoding
is [103, 114, 111, 195, 159],
str.bytes.to_a
# Encodings can be specified using their name in string form, or a
# predefined constant under the Encoding module.
str = "hello"
assert_nothing_raised {
str.force_encoding "UTF-8"
str.force_encoding Encoding::UTF_8
}
# To transcode a string, use the 'encode' or 'encode!' methods. This gives a
# string with the characters re-encoded for the target character set. The
# former returns a new string, and the latter updates the source string.
str = "groß"
str.encode!("ISO8859-1")
is Encoding::ISO8859_1,
str.encoding
is [103, 114, 111, 223],
str.bytes.to_a
# Normally this will raise an error if the source string contains an
# invalid character, or a source character isn't available in the target
# character set
str = "hello\xff"
err = assert_raises(Encoding::InvalidByteSequenceError) {
str.encode!("ISO8859-1")
}
str = "hello\u0100"
err = assert_raises(Encoding::UndefinedConversionError) {
str.encode!("ISO8859-1")
}
# However there are options you can apply which will override this
# behaviour - see 'ri String#encode' and 'ri Encoding::Converter.new'
# for full details.
#
# :invalid => :replace # replace invalid src chars
# :undef => :replace # replace undef'd dst chars
# :replace => "?" # the replacement character
# :xml => :text # undef'd dst chars -> &#xHEX;
# :xml => :attr # also quotes the result and " -> &quot;
# There are also three options for converting newlines, see ri.
str = "hello\xff"
str.encode!("ISO8859-1", :invalid => :replace)
is "hello?", str
str = "hello\u0100"
str.encode!("ISO8859-1", :undef => :replace)
is "hello?", str
# :xml => :text turns a character that has no ISO-8859-1 equivalent into a
# hexadecimal numeric character reference.
str = "hello\u0100"
str.encode!("ISO8859-1", :xml => :text)
is "hello&#x100;", str
# :xml => :attr additionally wraps the result in double quotes and escapes
# any embedded double quote as &quot;
str = "hello\"\u0100"
str.encode!("ISO8859-1", :xml => :attr)
is '"hello&quot;&#x100;"', str
# If you specify both :xml=>:text and :undef=>:replace, :xml wins
# (why not :undef=>:xml instead?)
str = "hello\u0100"
str.encode!("ISO8859-1", :xml => :text, :undef => :replace)
is "hello&#x100;", str
# REFERENCES: str_transcode() in transcode.c
# 3.2 Symbols
#
# Symbols have an 'encoding' but no 'ascii_only?' property. Rather, the
# encoding is forced to US-ASCII if the symbol contains only ASCII chars.
sym = "gro".to_sym # note that "gro" has encoding UTF-8
is Encoding::US_ASCII,
sym.encoding
sym = :groß
is Encoding::UTF_8,
sym.encoding
assert_raises(NoMethodError) {
sym.ascii_only?
}
# Symbols which consist of the same sequence of bytes but different
# encodings are distinct symbols (see under EQUALITY AND COLLATION)
# 3.3 Regular Expressions
#
# Regexps also have an 'encoding'. They do not have an 'ascii_only?'
# property, but they do have a related 'fixed_encoding?' property, which
# affects the matching compatibility rules (described later). Roughly
# speaking, a regexp with fixed_encoding? is intended to match strings only
# of the same encoding.
#
# The fixed_encoding? property is not visible when you convert the Regexp
# back to a string, unlike the //m, //i and //x flags.
re = /gro/
is [Encoding::US_ASCII, false, "/gro/", "(?-mix:gro)"],
[re.encoding, re.fixed_encoding?, re.inspect, re.to_s]
re = /groß/
is [Encoding::UTF_8, true, "/groß/", "(?-mix:groß)"],
[re.encoding, re.fixed_encoding?, re.inspect, re.to_s]
# A UTF-8-only Regexp literal, even without UTF-8 characters
re = /gro/u
is [Encoding::UTF_8, true, "/gro/", "(?-mix:gro)"],
[re.encoding, re.fixed_encoding?, re.inspect, re.to_s]
if RUBY_VERSION >= "1.9.2"
# Another way to do this (1.9.2 only)
re = Regexp.new("gro", Regexp::FIXEDENCODING)
is [Encoding::UTF_8, true, "/gro/", "(?-mix:gro)"],
[re.encoding, re.fixed_encoding?, re.inspect, re.to_s]
end
assert_raises(NoMethodError) {
/gro/.ascii_only?
}
# 3.4 File and IO objects
#
# These have two properties, 'external_encoding' and 'internal_encoding',
# and a 'set_encoding' method. We'll look at these later.
File.open(__FILE__, "r:UTF-8:ISO-8859-1") do |f|
is Encoding::UTF_8,
f.external_encoding
is Encoding::ISO_8859_1,
f.internal_encoding
end
# REFERENCE: see enc_capable() in encoding.c which detects classes which
# have encoding capabilities.
############# 4. VALID ENCODINGS ##########################
# Since you can change the encoding tags arbitrarily, it's possible to have
# a String which is not a valid sequence of characters in the selected
# character set. You can test for this using the 'valid_encoding?' method.
str = "hello\xdf".force_encoding("ISO-8859-1")
is true,
str.valid_encoding?
str.force_encoding("UTF-8")
is false,
str.valid_encoding?
# Some operations which work on a character-by-character basis,
# such as Regexp matches, will fail if the String has an invalid
# encoding.
str = "aß\xddf".force_encoding("UTF-8")
err = assert_raises(ArgumentError) {
str =~ /./
}
#is "invalid byte sequence in UTF-8",
# err.message
# Operations which treat the String as a sequence of bytes, such as
# writing it out to a file, will still succeed.
str = "aß\xddf".force_encoding("UTF-8")
assert_nothing_raised {
File.open(TMPFILE,"wb") { |f| f.write str }
}
File.delete(TMPFILE)
# Symbols have neither 'force_encoding' nor 'valid_encoding?' methods
assert_raises(NoMethodError) {
:gro.force_encoding("UTF-8")
}
assert_raises(NoMethodError) {
:gro.valid_encoding?
}
# As of ruby 1.9.2, you cannot create a symbol with an invalid encoding.
#
# Prior to this you could create a symbol with an invalid encoding, but you
# could not #inspect it, so irb gave an error if you tried to display one:
#
# >> "hello\xdf".to_sym
# ArgumentError: invalid byte sequence in UTF-8
# from /usr/local/lib/ruby/1.9.1/irb/inspector.rb:84:in `inspect'
#
# That appears to be a problem with the display, not the generation.
# That is: Symbol#inspect raises an exception for these symbols.
if RUBY_VERSION >= "1.9.2"
assert_raises(EncodingError) {
str = "hello\xdf".force_encoding("UTF-8")
sym = str.to_sym
}
else
str = "hello\xdf".force_encoding("UTF-8")
sym = str.to_sym
is Encoding::UTF_8,
sym.encoding
is [104, 101, 108, 108, 111, 223],
sym.to_s.bytes.to_a
assert_raises(ArgumentError) {
sym.inspect
}
end
# Similarly, Regexps do not have 'force_encoding' or 'valid_encoding?'
# methods.
assert_raises(NoMethodError) {
/gro/.force_encoding("UTF-8")
}
assert_raises(NoMethodError) {
/gro/.valid_encoding?
}
# You cannot create a Regexp with invalid characters
assert_raises(RegexpError) { # note: not Encoding::InvalidByteSequenceError
Regexp.new("hello\xdf")
}
############# 5. COMPATIBLE OBJECTS #############
# When an operation occurs on two encoding-aware objects, it will only
# succeed if the objects have "compatible" encodings. Furthermore, the
# encoding of the resultant value has to be chosen.
#
# Compatibility depends not only on the encoding tags of the objects, but
# also in the case of Strings on their contents.
#
# You can perform the test for compatibility, without actually performing
# an operation on the two objects, by using Encoding.compatible?(obj1,obj2).
# The return value is the encoding that the result would have, or nil if
# the objects are not compatible.
#
# Roughly speaking: two objects are compatible if they both have the same
# encoding, or either of them is empty or ascii_only.
#
# Here are the rules more accurately: they are invoked in this sequence,
# and the first matching rule wins.
#
# 1. Two objects are compatible if they have the same encoding; the resultant
# object will have the same encoding. (Note that 'object' includes File
# and Regexp here too, but I'm only testing using String)
a = "groß"
b = "über"
is [Encoding::UTF_8, Encoding::UTF_8, Encoding::UTF_8],
[a.encoding, b.encoding, Encoding.compatible?(a, b)]
# 2. Two objects are compatible if one of them is the empty string; the
# resultant object has the encoding of the other one.
a = "hello\xff"
a.force_encoding "ISO-8859-1"
b = ""
is [Encoding::ISO_8859_1, Encoding::UTF_8, Encoding::ISO_8859_1],
[a.encoding, b.encoding, Encoding.compatible?(a, b)]
a = ""
b = "hello\xff"
b.force_encoding "ISO-8859-1"
is [Encoding::UTF_8, Encoding::ISO_8859_1, Encoding::ISO_8859_1],
[a.encoding, b.encoding, Encoding.compatible?(a, b)]
# This is true even if the empty string has a non-ASCII-compatible encoding
a = "".force_encoding("UTF-16BE")
b = "hello\xff"
b.force_encoding "ISO-8859-1"
is [Encoding::UTF_16BE, Encoding::ISO_8859_1, Encoding::ISO_8859_1],
[a.encoding, b.encoding, Encoding.compatible?(a, b)]
# 3. The objects are not compatible if either uses a non-ASCII-compatible
# encoding
a = "aa".force_encoding "UTF-16BE"
b = "bb"
is [Encoding::UTF_16BE, Encoding::UTF_8, nil],
[a.encoding, b.encoding, Encoding.compatible?(a, b)]
# 4. If one of the objects is not a String but has the encoding "US-ASCII"
# then the objects are compatible, and the result has the encoding of
# the other
a = /a/
b = "bß"
is [Encoding::US_ASCII, Encoding::UTF_8, Encoding::UTF_8],
[a.encoding, b.encoding, Encoding.compatible?(a, b)]
a = "aß"
b = /b/
is [Encoding::UTF_8, Encoding::US_ASCII, Encoding::UTF_8],
[a.encoding, b.encoding, Encoding.compatible?(a, b)]
# 5. If one object is a String which contains only 7-bit ASCII characters
# (ascii_only?), and the other is an object with an ASCII-compatible
# encoding, then the objects are compatible and the result has the
# encoding of the other object.
a = "hello" # ascii_only
b = "\xff".force_encoding "ISO-8859-1" # ascii_compat encoding
is [Encoding::UTF_8, Encoding::ISO_8859_1, Encoding::ISO_8859_1],
[a.encoding, b.encoding, Encoding.compatible?(a,b)]
a = "groß" # ascii_compat encoding
b = "world".force_encoding "ISO-8859-1" # ascii_only
is [Encoding::UTF_8, Encoding::ISO_8859_1, Encoding::UTF_8],
[a.encoding, b.encoding, Encoding.compatible?(a,b)]
a = "hello" # ascii_only
b = "\xff\xff".force_encoding("UTF-16BE") # not ascii_compat
is nil,
Encoding.compatible?(a, b)
# If *both* are strings containing only 7-bit ASCII characters, then the
# result has the encoding of the first.
a = "hello".force_encoding "ISO-8859-1"
b = "world"
is Encoding::ISO_8859_1,
Encoding.compatible?(a,b)
# REFERENCE: rb_enc_compatible() in encoding.c
# Regexps with the fixed_encoding? flag are subject to a slightly stricter
# set of rules. In this case, a regexp which contains only ASCII characters
# is not compatible with a string with a different encoding if that other
# string contains non-ASCII characters.
re = /gro/u
str = "gro".force_encoding("ISO-8859-1")
assert_nothing_raised {
re =~ str
}
# but:
str = "gro\xdf".force_encoding("ISO-8859-1")
assert_raises(Encoding::CompatibilityError) {
re =~ str
}
############# 6. STRING CONCATENATION #############
# When you combine strings using << or +, the above compatibility rules
# are applied. Note that this means that even when you concatenate onto
# an existing string using <<, the encoding of that string may be
# silently changed.
a = "hello"
b = "hello\xdf".force_encoding("ISO-8859-1")
is Encoding::UTF_8,
a.encoding
a << b
is Encoding::ISO_8859_1,
a.encoding
# If the strings are not compatible then an exception is raised:
a = "hello\xdf".force_encoding("ISO-8859-1")
b = "groß"
assert_raises(Encoding::CompatibilityError) {
a << b
}
# This means that care is needed if combining Strings from unknown
# sources. If they are tagged with different encodings, then it might work
# (e.g. if one is empty, or one contains only ASCII characters);
# but at other times you may get an exception.
############# 7. THE BINARY / ASCII-8BIT ENCODING #############
# There is an encoding for binary data, called "ASCII-8BIT". You can also
# refer to this as "BINARY", but this is just an alias; if you ask such
# an object what its encoding is, you'll get "ASCII-8BIT" even if you
# specified it to be "BINARY".
a = "abc"
a.force_encoding "BINARY"
is "ASCII-8BIT",
a.encoding.to_s
# This encoding is ASCII-compatible. It is impossible to mark an object
# as "true binary" (not containing any ASCII text)
#
# Furthermore, this encoding gives you no special exemption from the
# compatibility rules. If you are appending things onto a "binary" string,
# and one of those happens to be tagged with a different character set and
# contain non-ASCII characters, then you will still get an exception.
a = "\xde\xad\xbe\xef".force_encoding("BINARY")
is Encoding::ASCII_8BIT,
a.encoding
b = "groß"
assert_raises(Encoding::CompatibilityError) {
a << b
}
# So if you are trying to build a binary message out of Strings which may be
# tagged with an encoding other than ASCII-8BIT, you need to keep forcing
# encodings.
a = "\xde\xad\xbe\xef".force_encoding("BINARY")
b = "groß"
b.force_encoding "ASCII-8BIT"
assert_nothing_raised {
a << b
}
# Note that it *is* permissible to label a string containing bytes with
# the top bit set as US-ASCII, without raising any error.
a = "\xde\xad\xbe\xef"
assert_nothing_raised {
a.force_encoding("US-ASCII")
}
is Encoding::US_ASCII,
a.encoding
# This makes it somewhat unclear as to what the difference between ASCII-8BIT
# and US-ASCII is supposed to be.
############# 8. SINGLE CHARACTERS #######
# The String#[] method now uses character indexes, rather than byte indexes.
# When given a single integer index it returns a one-character string.
a = "qłer"
is "ł", a[1]
# Strangely, selecting individual characters from the string succeeds even
# if the string has an invalid encoding!
str = "aß\xddf".force_encoding("UTF-8")
assert_equal ["a", "ß", "\xdd", "f", false],
[str[0], str[1], str[2], str[3], str.valid_encoding?]
# String#ord gives a codepoint of the first character in the string:
a = "qłer"
is 322, a[1].ord
is 322, a[1..-1].ord
# Integer#chr without an argument gives a US-ASCII encoding for 0-127,
# an ASCII-8BIT encoding for 128-255, and an exception for higher values.
a = 65.chr
is "A".force_encoding("US-ASCII"), a
a = 223.chr
is "\xdf".force_encoding("ASCII-8BIT"), a
assert_raises(RangeError) {
322.chr
}
# But Integer#chr can now take an encoding as an argument
a = 322.chr("UTF-8")
is "ł", a
# Note that Array#pack with C option silently truncates to 8 bits.
is((322 & 0xff).chr,
[322].pack("C"))
# However, String#% (or Kernel#sprintf) respects the encoding of the format
# string.
is "abcł",
"abc%c" % 322
############# 9. EQUALITY AND COLLATION ############
# How does encoding affect string equality and ordering when sorting?
# Strings are subject to a subset of compatibility rules defined above.
# Strings are equal if they are of the same length and have the same
# byte content, and are "comparable". Strings are comparable if either
# of them is empty; or they have the same encoding; or they have different
# encodings but both are ascii-compatible encodings and both strings are
# only using 7-bit characters.
a = "hello"
b = "hello".force_encoding("ISO-8859-1")
is true,
a == b
a = "groß"
b = "groß".force_encoding("ISO-8859-1")
is false,
a == b
# If the RHS is not a string, but responds to :to_str, then == is called
# on the RHS with the LHS as an argument, and the result converted to
# either true or false. Note that to_str is not called!
a = "hello"
b = Object.new
# Exists only so that `b` responds to :to_str; the comparison below must
# never invoke it, so any call fails loudly.
def b.to_str
  raise RuntimeError, "Not called"
end
# Answers every equality test with a truthy value that is not literally
# `true`, so the assertion below can show the result is converted to true.
def b.==(_lhs)
  :dummy_true_value
end
is true,
a == b
# REFERENCE: rb_str_equal, str_eql, rb_str_comparable in string.c
# Collation is done by means of the spaceship operator (<=>).
# If the strings are different sequences of bytes then a simple bytewise
# comparison is used, regardless of encoding. Note that it does *not* use
# the Unicode Collation Algorithm (UCA).
a = "hello"
b = "hellO"
is 1,
a <=> b # but in UCA, hello comes before hellO
# If one string is a prefix of the other then the longer string compares as greater.
# If the strings are bytewise equal and the encodings are equal, it returns 0.
# If the strings are bytewise equal and are comparable, it returns 0.
a = "hello"
b = "hello".force_encoding("ISO-8859-1")
is 0,
a <=> b
# If the strings are bytewise equal but not comparable, it returns -1 or 1
# dependent on an internal ordering of encodings.
a = "groß"
b = "groß".force_encoding("ISO-8859-1")
is -1,
a <=> b
is 1,
b <=> a
# If the RHS is not a string, but responds to :to_str and :<=>, then the
# spaceship operator of the RHS object is used, and the result negated.
# (That is, effectively the arguments are swapped). Note: the existence of
# :to_str is checked but it is not called.
a = "hello"
b = Object.new
# Always reports the same recognisable ordering value, whatever it is
# compared with; the assertion below expects to see it negated (-99).
def b.<=>(_other)
  99
end
# Exists only so that `b` responds to :to_str; the <=> call below checks for
# the method but must never invoke it, so any call fails loudly.
def b.to_str
  raise RuntimeError, "Not called"
end
is -99,
a <=> b
# Otherwise if the RHS is not a string, nil is returned (no exception raised)
a = "hello"
b = Object.new
is nil,
a <=> b
# REFERENCE: rb_str_cmp_m in string.c
# It's important to realise that ruby 1.9 does not sort by codepoints, it
# sorts by bytes. It's a convenient property of UTF-8 encoding that lower
# codepoints sort before higher ones, but this does not work for all
# encodings, not even all encodings of unicode. Here's an example of where
# the distinction is important:
s1 = 97.chr("UTF-8") # a
s2 = 257.chr("UTF-8") # ā
is true, s1 < s2 # expected
s1 = 97.chr("UTF-16LE") # a
s2 = 257.chr("UTF-16LE") # ā
is false, s1 < s2 # not ordered by codepoint
# In ruby 1.9 these questions have to be considered for symbols too, since
# symbols now have string-like properties. As far as I can see, the same
# rules are applied to symbols as for strings. In particular, this means
# that symbols which have the same sequence of bytes but different encodings
# are different symbols
s1 = "groß".force_encoding("UTF-8").to_sym
s2 = "groß".force_encoding("ISO-8859-1").to_sym
is false,
s1.object_id == s2.object_id
is false,
s1 == s2
is -1,
s1 <=> s2
# Symbols cannot usefully be compared directly to Strings though.
is nil,
:foo <=> "foo"
is false,
:foo == "foo"
# Symbols have a to_s but not to_str method.
assert_raises(NoMethodError) {
:foo.to_str
}
# Regular expressions can be tested for equality, and differ if they have
# differing encodings...
a1 = "groß".force_encoding("UTF-8")
a2 = "groß".force_encoding("ISO-8859-1")
is true,
Regexp.new(a1) == Regexp.new(a1)
is false,
Regexp.new(a1) == Regexp.new(a2)
# ... but they do not collate. Prior to 1.9.2, Regexp#<=> did not exist.
# In 1.9.2, Regexp#<=> returns 0 for equal regexps and nil otherwise.
# Object#<=> exists in 1.9.2 too.
if RUBY_VERSION >= "1.9.2"
is 0, Regexp.new("foo") <=> Regexp.new("foo")
is nil, Regexp.new("foo") <=> Regexp.new("bar")
is nil, Object.new <=> Object.new
else
assert_raises(NoMethodError) {
Regexp.new("foo") <=> Regexp.new("foo")
}
end
############# 10. HASH AND EQL? ############
# When Strings are used as Hash keys, the #hash and #eql? methods
# are used to determine whether Strings are the same. The rules
# for handling encodings are:
#
# - #hash includes the encoding in the hash calculation only if the
# string is not ascii_only?
# - #eql? returns false if the strings are not comparable (i.e. have
# different encodings and either is not ascii_only?)
s1 = "hello"
s2 = "hello".force_encoding("ISO-8859-1")
is true,
s1.hash == s2.hash
is true,
s1.eql?(s2)
s1 = "groß"
s2 = "groß".force_encoding("ISO-8859-1")
is false,
s1.hash == s2.hash
is false,
s1.eql?(s2)
# REFERENCES: rb_str_eq, rb_eql, rb_str_hash in string.c
############# 11. UPPER AND LOWER CASE #############
# There are five cases I can see where Ruby needs to distinguish and/or
# convert between 'lower case' and 'upper case' characters.
# 11.1 Regular expression character classes
# These are handled by the Oniguruma regexp library. Each encoding has
# its own rules for which characters are upper case, lower case, or
# neither. As far as I can see these are fixed per encoding - there is
# no variation per language or locale.
s = "übÊr"
is 0,
s =~ /[[:lower:]]/
is 2,
s =~ /[[:upper:]]/
# 11.2 Source parsing (distinguishing local variables from constants)
# The set of characters allowed in identifiers is defined by
# rb_enc_isalnum() plus underscore. "isalnum" is delegated to Oniguruma,
# and is also defined per character set.
is [1, 2],
eval(<<EOS)
# encoding: UTF-8
SCHÖN = 1 # constant
schloß = 2 # variable
[self.class.const_get(:SCHÖN), schloß]
EOS
# The code delegates the isupper test to Oniguruma too, which means that
# you'd think that a constant could start with a non-ASCII upper-case
# character, but in fact it overrides this later. Anything which starts
# with a non-ASCII uppercase character is treated as a local variable.