-
Notifications
You must be signed in to change notification settings - Fork 561
/
Copy pathlocale.c
10965 lines (9040 loc) · 418 KB
/
locale.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* locale.c
*
* Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
* 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others
*
* You may distribute under the terms of either the GNU General Public
* License or the Artistic License, as specified in the README file.
*
*/
/*
* A Elbereth Gilthoniel,
* silivren penna míriel
* o menel aglar elenath!
* Na-chaered palan-díriel
* o galadhremmin ennorath,
* Fanuilos, le linnathon
* nef aear, si nef aearon!
*
* [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"]
*/
/* utility functions for handling locale-specific stuff like what
* character represents the decimal point.
*
* All C programs have an underlying locale. Perl code generally doesn't pay
* any attention to it except within the scope of a 'use locale'. For most
* categories, it accomplishes this by just using different operations if it is
* in such scope than if not. However, various libc functions called by Perl
* are affected by the LC_NUMERIC category, so there are macros in perl.h that
* are used to toggle between the current locale and the C locale depending on
* the desired behavior of those functions at the moment. And, LC_MESSAGES is
* switched to the C locale for outputting the message unless within the scope
* of 'use locale'.
*
* There is more than the typical amount of variation between platforms with
* regard to locale handling. At the end of these introductory comments, are
* listed various relevent Configuration options, including some that can be
* used to pretend to some extent that this is being developed on a different
* platform than it actually is. This allows you to make changes and catch
* some errors without having access to those other platforms.
*
* This code now has multi-thread-safe locale handling on systems that support
* that. This is completely transparent to most XS code. On earlier systems,
* it would be possible to emulate thread-safe locales, but this likely would
* involve a lot of locale switching, and would require XS code changes.
* Macros could be written so that the code wouldn't have to know which type of
* system is being used.
*
* Table-driven code is used for simplicity and clarity, as many operations
* differ only in which category is being worked on. However the system
* categories need not be small contiguous integers, so do not lend themselves
* to table lookup. Instead we have created our own equivalent values which
* are all small contiguous non-negative integers, and translation functions
* between the two sets. For category 'LC_foo', the name of our index is
* LC_foo_INDEX_. Various parallel tables, indexed by these, are used for the
* translation. The tables are generated at compile-time based on platform
* characteristics and Configure options. They hide from the code many of the
* vagaries of the different locale implementations out there.
*
* On unthreaded perls, most operations expand out to just the basic
* setlocale() calls. That sort of is true on threaded perls on modern Windows
* systems where the same API, after set up, is used for thread-safe locale
* handling. (But there are complications on Windows due to internal character
* set issues.) On other systems, there is a completely different API,
* specified in POSIX 2008, to do thread-safe locales. On these systems, our
* bool_setlocale_2008_i() function is used to hide the different API from the
* outside. This makes it completely transparent to most XS code.
*
* A huge complicating factor is that the LC_NUMERIC category is normally held
* in the C locale, except during those relatively rare times when it needs to
* be in the underlying locale. There is a bunch of code to accomplish this,
* and to allow easy switches from one state to the other.
*
* In addition, the setlocale equivalents have versions for the return context,
* 'void' and 'bool', besides the full return value. This can present
* opportunities for avoiding work. We don't have to necessarily create a safe
* copy to return if no return is desired.
*
* There are 3.5 major implementations here; which one chosen depends on what
* the platform has available, and Configuration options.
*
* 1) Raw posix_setlocale(). This implementation is basically the libc
* setlocale(), with possibly minor tweaks. This is used for startup, and
* always for unthreaded perls, and when the API for safe locale threading
* is identical to the unsafe API (Windows, currently).
*
* This implementation is composed of two layers:
* a) posix_setlocale() implements the libc setlocale(). In most cases,
* it is just an alias for the libc version. But Windows doesn't
* fully conform to the POSIX standard, and this is a layer on top of
* libc to bring it more into conformance. And in Configurations
* where perl is to ignore some locale categories that the libc
* setlocale() knows about, there is a layer to cope with that.
* b) stdized_setlocale() is a layer above a) that fixes some vagaries in
* the return value of the libc setlocale(). On most platforms this
* layer is empty; in order to be activated, it requires perl to be
* Configured with a parameter indicating the platform's defect. The
* current ones are listed at the definition of the macro.
*
* 2) An implementation that adds a minimal layer above implementation 1),
* making that implementation uninterruptible and returning a
* per-thread/per-category value.
*
* 3a and 3b) An implementation of POSIX 2008 thread-safe locale handling,
* hiding from the programmer the completely different API for this.
* This automatically makes almost all code thread-safe without need for
* changes. This implementation is chosen on threaded perls when the
* platform properly supports the POSIX 2008 functions, and when there is no
* manual override to the contrary passed to Configure.
*
* 3a) is when the platform has a documented reliable querylocale() function
* or equivalent that is selected to be used.
* 3b) is when we have to emulate that functionality.
*
* Unfortunately, it seems that some platforms that claim to support these
* are buggy, in one way or another. There are workarounds encoded here,
* where feasible, for platforms where the bugs are amenable to that
* (glibc, for example). But other platforms instead don't use this
* implementation.
*
* z/OS (os390) is an outlier. Locales really don't work under threads when
* either the radix character isn't a dot, or attempts are made to change
* locales after the first thread is created. The reason is that IBM has made
* it thread-safe by refusing to change locales (returning failure if
* attempted) any time after an application has called pthread_create() to
* create another thread. The expectation is that an application will set up
* its locale information before the first fork, and be stable thereafter. But
* perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
* the other toggles, which are less common.
*
* Associated with each implementation are three sets of macros that translate
* a consistent API into what that implementation needs. Each set consists of
* three macros with the suffixes:
* _c Means the argument is a locale category number known at compile time.
* An example would be LC_TIME. This token is a compile-time constant
* and can be passed to a '_c' macro.
* _r Means the argument is a locale category number whose value might not be
* known until runtime
* _i Means the argument is our internal index of a locale category
*
* The three sets are: ('_X' means one of '_c', '_r', '_i')
* 1) bool_setlocale_X()
* This calls the appropriate setlocale()-equivalent for the
* implementation, with the category and new locale. The input locale is
* not necessarily valid, so the return is true or false depending on
* whether or not the setlocale() succeeded. This is not used for
* querying the locale, so the input locale must not be NULL.
*
* This macro is suitable for toggling the locale back and forth during an
* operation. For example, the names of days and months under LC_TIME are
* strings that are also subject to LC_CTYPE. If the locales of these two
* categories differ, mojibake can result on many platforms. The code
* here will toggle LC_CTYPE into the locale of LC_TIME temporarily to
* avoid this.
*
* Several categories require extra work when their locale is changed.
* LC_CTYPE, for example, requires the calculation of the table of which
* characters fold to which others under /i pattern matching or fc(), as
* folding is not a concept in POSIX. This table isn't needed when the
* LC_CTYPE locale gets toggled during an operation, and will be toggled
* back before return to the caller. To save work that would be
* discarded, the bool_setlocale_X() implementations don't do this extra
* work. Instead, there is a separate function for just this purpose to
* be done before control is transferred back to the external caller. All
* categories that have such requirements have such a function. The
* update_functions[] array contains pointers to them (or NULL for
* categories which don't need a function).
*
* Care must be taken to remember to call the separate function before
* returning to an external caller, and to not use things it updates
* before its call. An alternative approach would be to have
* bool_setlocale_X() always call the update, which would return
* immediately if a flag wasn't set indicating it was time to actually
* perform it.
*
* 2) void_setlocale_X()
* This is like bool_setlocale_X(), but it is used only when it is
* expected that the call must succeed, or something is seriously wrong.
* A panic is issued if it fails. The caller uses this form when it just
* wants to assume things worked.
*
* 3) querylocale_X()
* This returns a string that specifies the current locale for the given
* category given by the input argument. The string is safe from other
* threads zapping it, and the caller need not worry about freeing it, but
* it may be mortalized, so must be copied if you need to preserve it
* across calls, or long term. This returns the actual current locale,
* not the nominal. These differ, for example, when LC_NUMERIC is
* supposed to be a locale whose decimal radix character is a comma. As
* mentioned above, Perl actually keeps this category set to C in such
* circumstances so that XS code can just assume a dot radix character.
* querylocale_X() returns the locale that libc has stored at this moment,
* so most of the time will return a locale whose radix character is a
* dot. The macro query_nominal_locale_i() can be used to get the nominal
* locale that an external caller would expect, for all categories except
* LC_ALL. For that, you can use the function
* S_calculate_LC_ALL_string(). Or S_native_querylocale_i() will operate
* on any category.
*
* The underlying C API that this implements uses category numbers, hence the
* code is structured to use '_r' at the API level to convert to indexes, which
* are then used internally with the '_i' forms.
*
* The splitting apart into setting vs querying means that the return value of
* the bool macros is not subject to potential clashes with other threads,
* eliminating any need for the calling code to worry about that and get it
* wrong. Whereas, you do have to think about thread interactions when using a
* query.
*
* Additionally, for the implementations where there aren't any complications,
* a setlocale_i() is defined that is like plain setlocale(), returning the new
* locale. Thus it combines a bool_setlocale_X() with a querylocale_X(). It
* is used only for performance on implementations that allow it, such as
* non-threaded perls.
*
* There are also a few other macros herein that use this naming convention to
* describe their category parameter.
*
* Relevant Configure options
*
* -Accflags=-DNO_LOCALE
* This compiles perl to always use the C locale, ignoring any
* attempts to change it. This could be useful on platforms with a
* crippled locale implementation.
*
* -Accflags=-DNO_THREAD_SAFE_LOCALE
* Even if thread-safe operations are available on this platform and
* would otherwise be used (because this is a perl with multiplicity),
* perl is compiled to not use them. This could be useful on
* platforms where the libc is buggy.
*
* -Accflags=-DNO_POSIX_2008_LOCALE
* Even if the libc locale operations specified by the Posix 2008
* Standard are available on this platform and would otherwise be used
* (because this is a threaded perl), perl is compiled to not use
* them. This could be useful on platforms where the libc is buggy.
* This is like NO_THREAD_SAFE_LOCALE, but has no effect on platforms
* that don't have these functions.
*
* -Accflags=-DUSE_POSIX_2008_LOCALE
* Normally, setlocale() is used for locale operations on perls
* compiled without threads. This option causes the locale operations
* defined by the Posix 2008 Standard to always be used instead. This
* could be useful on platforms where the libc setlocale() is buggy.
*
* -Accflags=-DNO_THREAD_SAFE_QUERYLOCALE
* This applies only to platforms that have a querylocale() libc
* function. perl assumes that that function is thread-safe, unless
* overridden by this, typically in a hints file. When overridden,
* querylocale() is called only while the locale mutex is locked, and
* the result is copied to a per-thread place before unlocking.
*
* -Accflags=-DNO_USE_NL_LOCALE_NAME
* glibc has an undocumented equivalent function to querylocale(),
* which our experience indicates is reliable. But you can forbid its
* use by specifying this Configure option (with no effect on systems
* lacking it). When this is function is enabled, it removes the need
* for perl to keep its own records, hence is more efficient and
* guaranteed to be accurate.
*
* -Accflags=-DNO_LOCALE_CTYPE
* -Accflags=-DNO_LOCALE_NUMERIC
* etc.
*
* If the named category(ies) does(do) not exist on this platform,
* these have no effect. Otherwise they cause perl to be compiled to
* always keep the named category(ies) in the C locale.
*
* -Accflags=-DHAS_BROKEN_SETLOCALE_QUERY_LC_ALL
* This would be set in a hints file to tell perl that doing a libc
* setlocale(LC_ALL, NULL)
* can give erroneous results, and perl will compensate to get the
* correct results. This is known to be a problem in earlier AIX
* versions
*
* -Accflags=-DHAS_BROKEN_LANGINFO_CODESET
* This would be set in a hints file to tell perl that doing a libc
* nl_langinfo(CODESET)
* can give empty results. This causes perl to be compiled to not use
* nl_langinfo() to determine if the current locale is a UTF-8 one or
* not. Perl continues to presume that a non-empty return from
* nl_langinfo() is correct, and empty returns are just passed on to
* the caller. khw can't figure out a workaround for this portion of
* the bug. The bug exists on Darwin.
*
* -Accflags=-DHAS_LF_IN_SETLOCALE_RETURN
* This would be set in a hints file to tell perl that a libc
* setlocale() can return results containing \n characters that need
* to be stripped off. khw believes there aren't any such platforms
* still in existence.
*
* -Accflags=-DLIBC_HANDLES_MISMATCHED_CTYPE
* Consider the name of a month in some language, Chinese for example.
* If LC_TIME has been set to a Chinese locale, strftime() can be used
* to generate the Chinese month name for any given date, by using the
* %B format. But also suppose that LC_CTYPE is set to, say, "C".
* The return from strftime() on many platforms will be mojibake given
* that no Chinese month name is composed of just ASCII characters.
* Perl handles this for you by automatically toggling LC_CTYPE to
* whatever LC_TIME is during the execution of strftime(), and
* afterwards restoring it to its prior value. But the strftime()
* (and similar functions) in some libc implementations already do
* this toggle, meaning perl's action is redundant. You can tell perl
* that a libc does this by setting this Configure option, and it will
* skip its syncing LC_CTYPE and whatever the other locale is.
* Currently, perl ignores this Configuration option and syncs anyway
* for LC_COLLATE-related operations, due to perl's internal needs.
*
* -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION
* This is used when developing Perl on a platform that uses
* 'name=value;' notation to represent LC_ALL when not all categories
* are the same. When so compiled, much of the code gets compiled
* and exercised that applies to platforms that instead use positional
* notation. This allows for finding many bugs in that portion of the
* implementation, without having to access such a platform.
*
* -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES
* This is used when developing Perl on a non-Windows platform to
* compile and exercise much of the locale-related code that instead
* applies to MingW platforms that don't use the more modern UCRT
* library. This allows for finding many bugs in that portion of the
* implementation, without having to access such a platform.
*/
/* If the environment says to, we can output debugging information during
* initialization. This is done before option parsing, and before any thread
* creation, so can be a file-level static. (Must come before #including
* perl.h) */
#include "config.h"
/* Returns the Unix errno portion; ignoring any others. This is a macro here
* instead of putting it into perl.h, because unclear to khw what should be
* done generally. */
#define GET_ERRNO saved_errno
#ifdef DEBUGGING
static int debug_initialization = 0;
# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
# define DEBUG_LOCALE_INITIALIZATION_ debug_initialization
# ifdef HAS_EXTENDED_OS_ERRNO
/* Output the non-zero errno and/or the non-zero extended errno */
# define DEBUG_ERRNO \
dSAVE_ERRNO; dTHX; \
int extended = get_extended_os_errno(); \
const char * errno_string; \
if (GET_ERRNO == 0) { /* Skip output if both errno types are 0 */ \
if (LIKELY(extended == 0)) errno_string = ""; \
else errno_string = Perl_form(aTHX_ "; $^E=%d", extended); \
} \
else if (LIKELY(extended == GET_ERRNO)) \
errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO); \
else errno_string = Perl_form(aTHX_ "; $!=%d, $^E=%d", \
GET_ERRNO, extended);
# else
/* Output the errno, if non-zero */
# define DEBUG_ERRNO \
dSAVE_ERRNO; \
const char * errno_string = ""; \
if (GET_ERRNO != 0) { \
dTHX; \
errno_string = Perl_form(aTHX_ "; $!=%d", GET_ERRNO); \
}
# endif
/* Automatically include the caller's file, and line number in debugging output;
* and the errno (and/or extended errno) if non-zero. On threaded perls add
* the aTHX too. */
# if defined(MULTIPLICITY) && ! defined(NO_LOCALE_THREADS)
# define DEBUG_PRE_STMTS \
DEBUG_ERRNO; \
PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf ": 0x%p%s: ", \
__FILE__, (line_t)__LINE__, aTHX_ \
errno_string);
# else
# define DEBUG_PRE_STMTS \
DEBUG_ERRNO; \
PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf "%s: ", \
__FILE__, (line_t)__LINE__, \
errno_string);
# endif
# define DEBUG_POST_STMTS RESTORE_ERRNO;
#else
# define debug_initialization 0
# define DEBUG_INITIALIZATION_set(v)
# define DEBUG_PRE_STMTS
# define DEBUG_POST_STMTS
#endif
#include "EXTERN.h"
#define PERL_IN_LOCALE_C
#include "perl.h"
/* Some platforms require LC_CTYPE to be congruent with the category we are
* looking for. XXX This still presumes that we have to match COLLATE and
* CTYPE even on platforms that apparently handle this. */
#if defined(USE_LOCALE_CTYPE) && ! defined(LIBC_HANDLES_MISMATCHED_CTYPE)
# define WE_MUST_DEAL_WITH_MISMATCHED_CTYPE /* no longer used; kept for
possible future use */
# define start_DEALING_WITH_MISMATCHED_CTYPE(locale) \
const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale)
# define end_DEALING_WITH_MISMATCHED_CTYPE(locale) \
restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
#else
# define start_DEALING_WITH_MISMATCHED_CTYPE(locale)
# define end_DEALING_WITH_MISMATCHED_CTYPE(locale)
#endif
#ifdef WIN32_USE_FAKE_OLD_MINGW_LOCALES
/* Use -Accflags=-DWIN32_USE_FAKE_OLD_MINGW_LOCALES on a POSIX or *nix box
* to get a semblance of pretending the locale handling is that of a MingW
* that doesn't use UCRT (hence 'OLD' in the name). This exercizes code
* paths that are not compiled on non-Windows boxes, and allows for ASAN
* and PERL_MEMLOG. This is thus a way to see if locale.c on Windows is
* likely going to compile, without having to use a real Win32 box. And
* running the test suite will verify to a large extent our logic and memory
* allocation handling for such boxes. Of course the underlying calls are
* to the POSIX libc, so any differences in implementation between those and
* the Windows versions will not be caught by this. */
# define WIN32
# undef P_CS_PRECEDES
# undef CURRENCY_SYMBOL
# define CP_UTF8 -1
# undef _configthreadlocale
# define _configthreadlocale(arg) NOOP
# define MultiByteToWideChar(cp, flags, byte_string, m1, wstring, req_size) \
(PERL_UNUSED_ARG(cp), \
mbsrtowcs(wstring, &(byte_string), req_size, NULL) + 1)
# define WideCharToMultiByte(cp, flags, wstring, m1, byte_string, \
req_size, default_char, found_default_char) \
(PERL_UNUSED_ARG(cp), \
wcsrtombs(byte_string, &(wstring), req_size, NULL) + 1)
# ifdef USE_LOCALE
static const wchar_t * wsetlocale_buf = NULL;
static Size_t wsetlocale_buf_size = 0;
# ifdef MULTIPLICITY
static PerlInterpreter * wsetlocale_buf_aTHX = NULL;
# endif
STATIC
const wchar_t *
S_wsetlocale(const int category, const wchar_t * wlocale)
{
/* Windows uses a setlocale that takes a wchar_t* locale. Other boxes
* don't have this, so this Windows replacement converts the wchar_t input
* to plain 'char*', calls plain setlocale(), and converts the result back
* to 'wchar_t*' */
const char * byte_locale = NULL;
if (wlocale) {
byte_locale = Win_wstring_to_byte_string(CP_UTF8, wlocale);
}
const char * byte_result = setlocale(category, byte_locale);
Safefree(byte_locale);
if (byte_result == NULL) {
return NULL;
}
const wchar_t * wresult = Win_byte_string_to_wstring(CP_UTF8, byte_result);
if (! wresult) {
return NULL;
}
/* Emulate a global static memory return from wsetlocale(). This currently
* leaks at process end; would require changing LOCALE_TERM to fix that */
Size_t string_size = wcslen(wresult) + 1;
if (wsetlocale_buf_size == 0) {
Newx(wsetlocale_buf, string_size, wchar_t);
wsetlocale_buf_size = string_size;
# ifdef MULTIPLICITY
dTHX;
wsetlocale_buf_aTHX = aTHX;
# endif
}
else if (string_size > wsetlocale_buf_size) {
Renew(wsetlocale_buf, string_size, wchar_t);
wsetlocale_buf_size = string_size;
}
Copy(wresult, wsetlocale_buf, string_size, wchar_t);
Safefree(wresult);
return wsetlocale_buf;
}
# define _wsetlocale(category, wlocale) S_wsetlocale(category, wlocale)
# endif
#endif /* WIN32_USE_FAKE_OLD_MINGW_LOCALES */
/* 'for' loop headers to hide the necessary casts */
#define for_category_indexes_between(i, m, n) \
for (locale_category_index i = (locale_category_index) (m); \
i <= (locale_category_index) (n); \
i = (locale_category_index) ((int) i + 1))
#define for_all_individual_category_indexes(i) \
for_category_indexes_between(i, 0, LC_ALL_INDEX_ - 1)
#define for_all_but_0th_individual_category_indexes(i) \
for_category_indexes_between(i, 1, LC_ALL_INDEX_ - 1)
#define for_all_category_indexes(i) \
for_category_indexes_between(i, 0, LC_ALL_INDEX_)
#ifdef USE_LOCALE
# if defined(USE_FAKE_LC_ALL_POSITIONAL_NOTATION) && defined(LC_ALL)
/* This simulates an underlying positional notation for LC_ALL when compiled on
* a system that uses name=value notation. Use this to develop on Linux and
* make a quick check that things have some chance of working on a positional
* box. Enable by adding to the Congfigure parameters:
* -Accflags=USE_FAKE_LC_ALL_POSITIONAL_NOTATION
*
* NOTE it redefines setlocale() and usequerylocale()
* */
STATIC const char *
S_positional_name_value_xlation(const char * locale, bool direction)
{ /* direction == 1 is from name=value to positional
direction == 0 is from positional to name=value */
assert(locale);
dTHX;
const char * individ_locales[LC_ALL_INDEX_] = { NULL };
/* This parses either notation */
switch (parse_LC_ALL_string(locale,
(const char **) &individ_locales,
no_override, /* Handled by other code */
false, /* Return only [0] if suffices */
false, /* Don't panic on error */
__LINE__))
{
default: /* Some compilers don't realize that below is the complete
list of the available enum values */
case invalid:
return NULL;
case no_array:
return locale;
case only_element_0:
SAVEFREEPV(individ_locales[0]);
return individ_locales[0];
case full_array:
{
calc_LC_ALL_format format = (direction)
? EXTERNAL_FORMAT_FOR_SET
: INTERNAL_FORMAT;
const char * retval = calculate_LC_ALL_string(individ_locales,
format,
WANT_TEMP_PV,
__LINE__);
for_all_individual_category_indexes(i) {
Safefree(individ_locales[i]);
}
return retval;
}
}
}
STATIC const char *
S_positional_setlocale(int cat, const char * locale)
{
if (cat != LC_ALL) return setlocale(cat, locale);
if (locale && strNE(locale, "")) {
locale = S_positional_name_value_xlation(locale, 0);
if (! locale) return NULL;
}
locale = setlocale(cat, locale);
if (locale == NULL) return NULL;
return S_positional_name_value_xlation(locale, 1);
}
# undef setlocale
# define setlocale(a,b) S_positional_setlocale(a,b)
# ifdef USE_POSIX_2008_LOCALE
STATIC locale_t
S_positional_newlocale(int mask, const char * locale, locale_t base)
{
assert(locale);
if (mask != LC_ALL_MASK) return newlocale(mask, locale, base);
if (strNE(locale, "")) locale = S_positional_name_value_xlation(locale, 0);
if (locale == NULL) return NULL;
return newlocale(LC_ALL_MASK, locale, base);
}
# undef newlocale
# define newlocale(a,b,c) S_positional_newlocale(a,b,c)
# endif
# endif
#endif /* End of fake positional notation */
#include "reentr.h"
#ifdef I_WCHAR
# include <wchar.h>
#endif
#ifdef I_WCTYPE
# include <wctype.h>
#endif
/* The main errno that gets used is this one, on platforms that support it */
#ifdef EINVAL
# define SET_EINVAL SETERRNO(EINVAL, LIB_INVARG)
#else
# define SET_EINVAL
#endif
/* This is a starting guess as to when this is true. It definititely isn't
* true on *BSD where positional LC_ALL notation is used. Likely this will end
* up being defined in hints files. */
#ifdef PERL_LC_ALL_USES_NAME_VALUE_PAIRS
# define NEWLOCALE_HANDLES_DISPARATE_LC_ALL
#endif
/* But regardless, we have to look at individual categories if some are
* ignored. */
#ifdef HAS_IGNORED_LOCALE_CATEGORIES_
# undef NEWLOCALE_HANDLES_DISPARATE_LC_ALL
#endif
#ifdef USE_LOCALE
/* Not all categories need be set to the same locale. This macro determines if
* 'name' which represents LC_ALL is uniform or disparate. There are two
* situations: 1) the platform uses unordered name=value pairs; 2) the platform
* uses ordered positional values, with a separator string between them */
# ifdef PERL_LC_ALL_SEPARATOR /* positional */
# define is_disparate_LC_ALL(name) cBOOL(instr(name, PERL_LC_ALL_SEPARATOR))
# else /* name=value */
/* In the, hopefully never occurring, event that the platform doesn't use
* either mechanism for disparate LC_ALL's, assume the name=value pairs
* form, rather than taking the extreme step of refusing to compile. Many
* programs won't have disparate locales, so will generally work */
# define PERL_LC_ALL_SEPARATOR ";"
# define is_disparate_LC_ALL(name) cBOOL( strchr(name, ';') \
&& strchr(name, '='))
# endif
/* It is possible to compile perl to always keep any individual category in the
* C locale. This would be done where the implementation on a platform is
* flawed or incomplete. At the time of this writing, for example, OpenBSD has
* not implemented LC_COLLATE beyond the C locale. The 'category_available[]'
* table is a bool that says whether a category is changeable, or must be kept
* in C. This macro substitutes C for the locale appropriately, expanding to
* nothing on the more typical case where all possible categories present on
* the platform are handled. */
# if defined(HAS_IGNORED_LOCALE_CATEGORIES_) \
|| defined(HAS_MISSING_LANGINFO_ITEM_)
# define need_to_override_category(i) (! category_available[i])
# define override_ignored_category(i, new_locale) \
((need_to_override_category(i)) ? "C" : (new_locale))
# else
# define need_to_override_category(i) 0
# define override_ignored_category(i, new_locale) (new_locale)
# endif
PERL_STATIC_INLINE const char *
S_mortalized_pv_copy(pTHX_ const char * const pv)
{
PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
/* Copies the input pv, and arranges for it to be freed at an unspecified
* later time. */
if (pv == NULL) {
return NULL;
}
const char * copy = savepv(pv);
SAVEFREEPV(copy);
return copy;
}
#endif
/* Default values come from the C locale */
#define C_codeset "ANSI_X3.4-1968" /* Only in some Configurations, and usually
a single instance, so is a #define */
static const char C_decimal_point[] = ".";
#if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
# define HAS_SOME_LANGINFO
#endif
#if (defined(USE_LOCALE_NUMERIC) && ! defined(TS_W32_BROKEN_LOCALECONV)) \
|| ! ( defined(USE_LOCALE_NUMERIC) \
&& (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)))
static const char C_thousands_sep[] = "";
#endif
/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
* return of setlocale(), then this is extremely likely to be the C or POSIX
* locale. However, the output of setlocale() is documented to be opaque, but
* the odds are extremely small that it would return these two strings for some
* other locale. Note that VMS includes many non-ASCII characters in these two
* locales as controls and punctuation (below are hex bytes):
* cntrl: 84-97 9B-9F
* punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD
* Oddly, none there are listed as alphas, though some represent alphabetics
* https://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */
#define isNAME_C_OR_POSIX(name) \
( (name) != NULL \
&& (( *(name) == 'C' && (*(name + 1)) == '\0') \
|| strEQ((name), "POSIX")))
/* If this interface to nl_langinfo() isn't defined by embed.fnc, it means it
* isn't available on this platform, so instead emulate it */
#ifndef langinfo_sv_i
# define langinfo_sv_i(i, c, l, s, u) \
(PERL_UNUSED_VAR(c), emulate_langinfo(i, l, s, u))
#endif
/* In either case, create a version that takes things like 'LC_NUMERIC' as a
* parameter */
#define langinfo_sv_c(item, category, locale, sv, utf8ness) \
langinfo_sv_i(item, category##_INDEX_, locale, sv, utf8ness)
/* The normal method for interfacing with nl_langinfo() in this file is to use
* a scratch buffer (whose existence is hidden from the caller by these
* macros). */
#define langinfo_i(item, index, locale, utf8ness) \
langinfo_sv_i(item, index, locale, PL_scratch_langinfo, utf8ness)
#define langinfo_c(item, category, locale, utf8ness) \
langinfo_i(item, category##_INDEX_, locale, utf8ness)
#ifndef USE_LOCALE /* A no-op unless locales are enabled */
# define toggle_locale_i(index, locale) \
((const char *) (PERL_UNUSED_VAR(locale), NULL))
# define restore_toggled_locale_i(index, locale) PERL_UNUSED_VAR(locale)
#else
# define toggle_locale_i(index, locale) \
S_toggle_locale_i(aTHX_ index, locale, __LINE__)
# define restore_toggled_locale_i(index, locale) \
S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
#endif
# define toggle_locale_c(cat, locale) toggle_locale_i(cat##_INDEX_, locale)
# define restore_toggled_locale_c(cat, locale) \
restore_toggled_locale_i(cat##_INDEX_, locale)
#ifdef USE_LOCALE
# ifdef DEBUGGING
# define setlocale_debug_string_i(index, locale, result) \
my_setlocale_debug_string_i(index, locale, result, __LINE__)
# define setlocale_debug_string_c(category, locale, result) \
setlocale_debug_string_i(category##_INDEX_, locale, result)
# define setlocale_debug_string_r(category, locale, result) \
setlocale_debug_string_i(get_category_index(category), \
locale, result)
# endif
/* On systems without LC_ALL, pretending it exists anyway simplifies things.
* Choose a value for it that is very unlikely to clash with any actual
* category */
# define FAKE_LC_ALL PERL_INT_MIN
/* Below are parallel arrays for locale information indexed by our mapping of
* category numbers into small non-negative indexes. locale_table.h contains
* an entry like this for each individual category used on this system:
* PERL_LOCALE_TABLE_ENTRY(CTYPE, S_new_ctype)
*
* Each array redefines PERL_LOCALE_TABLE_ENTRY to generate the information
* needed for that array, and #includes locale_table.h to get the valid
* categories.
*
* An entry for the conglomerate category LC_ALL is added here, immediately
* following the individual categories. (The treatment for it varies, so can't
* be in locale_table.h.)
*
* Following this, each array ends with an entry for illegal categories. All
* category numbers unknown to perl get mapped to this entry. This is likely
* to be a parameter error from the calling program; but it could be that this
* platform has a category we don't know about, in which case it needs to be
* added, using the paradigm of one of the existing categories. */
/* The first array is the locale categories perl uses on this system, used to
* map our index back to the system's category number. */
STATIC const int categories[] = {
# undef PERL_LOCALE_TABLE_ENTRY
# define PERL_LOCALE_TABLE_ENTRY(name, call_back) LC_ ## name,
# include "locale_table.h"
# ifdef LC_ALL
LC_ALL,
# else
FAKE_LC_ALL,
# endif
(FAKE_LC_ALL + 1) /* Entry for unknown category; this number is unlikely
to clash with a real category */
};
# define GET_NAME_AS_STRING(token) # token
# define GET_LC_NAME_AS_STRING(token) GET_NAME_AS_STRING(LC_ ## token)
/* The second array is the category names. */
STATIC const char * const category_names[] = {
# undef PERL_LOCALE_TABLE_ENTRY
# define PERL_LOCALE_TABLE_ENTRY(name, call_back) GET_LC_NAME_AS_STRING(name),
# include "locale_table.h"
# ifdef LC_ALL
# define LC_ALL_STRING "LC_ALL"
# else
# define LC_ALL_STRING "If you see this, it is a bug in perl;" \
" please report it via perlbug"
# endif
LC_ALL_STRING,
# define LC_UNKNOWN_STRING "Locale category unknown to Perl; if you see" \
" this, it is a bug in perl; please report it" \
" via perlbug"
LC_UNKNOWN_STRING
};
STATIC const Size_t category_name_lengths[] = {
# undef PERL_LOCALE_TABLE_ENTRY
# define PERL_LOCALE_TABLE_ENTRY(name, call_back) \
STRLENs(GET_LC_NAME_AS_STRING(name)),
# include "locale_table.h"
STRLENs(LC_ALL_STRING),
STRLENs(LC_UNKNOWN_STRING)
};
/* Each entry includes space for the '=' and ';' */
# undef PERL_LOCALE_TABLE_ENTRY
# define PERL_LOCALE_TABLE_ENTRY(name, call_back) \
+ STRLENs(GET_LC_NAME_AS_STRING(name)) + 2
STATIC const Size_t lc_all_boiler_plate_length = 1 /* space for trailing NUL */
# include "locale_table.h"
;
/* A few categories require additional setup when they are changed. This table
* points to the functions that do that setup */
STATIC void (*update_functions[]) (pTHX_ const char *, bool force) = {
# undef PERL_LOCALE_TABLE_ENTRY
# define PERL_LOCALE_TABLE_ENTRY(name, call_back) call_back,
# include "locale_table.h"
S_new_LC_ALL,
NULL, /* No update for unknown category */
};
# if defined(HAS_IGNORED_LOCALE_CATEGORIES_) \
|| defined(HAS_MISSING_LANGINFO_ITEM_)
/* Indicates if each category on this platform is available to use not in
* the C locale */
STATIC const bool category_available[] = {
# undef PERL_LOCALE_TABLE_ENTRY
# define PERL_LOCALE_TABLE_ENTRY(name, call_back) LC_ ## name ## _AVAIL_,
# include "locale_table.h"
# ifdef LC_ALL
true,
# else
false,
# endif
false /* LC_UNKNOWN_AVAIL_ */
};
# endif
# if defined(USE_POSIX_2008_LOCALE)
STATIC const int category_masks[] = {
# undef PERL_LOCALE_TABLE_ENTRY
# define PERL_LOCALE_TABLE_ENTRY(name, call_back) LC_ ## name ## _MASK,
# include "locale_table.h"
LC_ALL_MASK, /* Will rightly refuse to compile unless this is defined */
0 /* Empty mask for unknown category */
};
# endif
# if ! defined(PERL_LC_ALL_USES_NAME_VALUE_PAIRS)
/* On platforms that use positional notation for expressing LC_ALL, this maps
* the position of each category to our corresponding internal index for it.
* This is initialized at run time if needed. LC_ALL_INDEX_ is not legal for
* an individual locale, hence marks the elements here as not actually
* initialized. */
STATIC
unsigned int
map_LC_ALL_position_to_index[LC_ALL_INDEX_] = { LC_ALL_INDEX_ };
# endif
#endif
#if defined(USE_LOCALE) || defined(DEBUGGING)
STATIC const char *
S_get_displayable_string(pTHX_
const char * const s,
const char * const e,
const bool is_utf8)
{
PERL_ARGS_ASSERT_GET_DISPLAYABLE_STRING;
if (e <= s) {
return "";
}
const char * t = s;
bool prev_was_printable = TRUE;
bool first_time = TRUE;
char * ret;
/* Worst case scenario: All are non-printable so have a blank between each.
* If UTF-8, all are the largest possible code point; otherwise all are a
* single byte. '(2 + 1)' is from each byte takes 2 characters to
* display, and a blank (or NUL for the final one) after it */
const Size_t size = (e - s) * (2 + 1) * ((is_utf8) ? UVSIZE : 1);
Newxz(ret, size, char);
SAVEFREEPV(ret);
while (t < e) {
UV cp = (is_utf8)
? utf8_to_uvchr_buf((U8 *) t, e, NULL)
: * (U8 *) t;
if (isPRINT(cp)) {
if (! prev_was_printable) {
my_strlcat(ret, " ", size);
}
/* Escape these to avoid any ambiguity */
if (cp == ' ' || cp == '\\') {
my_strlcat(ret, "\\", size);
}
my_strlcat(ret, Perl_form(aTHX_ "%c", (U8) cp), size);
prev_was_printable = TRUE;
}
else {
if (! first_time) {
my_strlcat(ret, " ", size);
}
my_strlcat(ret, Perl_form(aTHX_ "%02" UVXf, cp), size);
prev_was_printable = FALSE;
}
t += (is_utf8) ? UTF8SKIP(t) : 1;
first_time = FALSE;
}
return ret;
}
#endif
#ifdef USE_LOCALE
# define get_category_index(cat) get_category_index_helper(cat, NULL, __LINE__)
STATIC locale_category_index
S_get_category_index_helper(pTHX_ const int category, bool * succeeded,
const line_t caller_line)
{
PERL_ARGS_ASSERT_GET_CATEGORY_INDEX_HELPER;
/* Given a category, return the equivalent internal index we generally use
* instead, warn or panic if not found. */
locale_category_index i;
# undef PERL_LOCALE_TABLE_ENTRY
# define PERL_LOCALE_TABLE_ENTRY(name, call_back) \
case LC_ ## name: i = LC_ ## name ## _INDEX_; break;
switch (category) {
# include "locale_table.h"
# ifdef LC_ALL
case LC_ALL: i = LC_ALL_INDEX_; break;