Skip to content

Commit 9290868

Browse files
[3.12] gh-53203: Fix strptime() for %c, %x and %X formats on many locales (GH-125406) (GH-125454) (GH-125483)
Fixed most locales that use non-ASCII digits, like Persian, Burmese, Odia and Shan. (cherry picked from commit 5f4e5b5) (cherry picked from commit cbcdf34) Co-authored-by: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
1 parent f1a6f68 commit 9290868

File tree

4 files changed

+74
-42
lines changed

4 files changed

+74
-42
lines changed

Lib/_strptime.py

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import locale
1515
import calendar
1616
from re import compile as re_compile
17+
from re import sub as re_sub
1718
from re import IGNORECASE
1819
from re import escape as re_escape
1920
from datetime import (date as datetime_date,
@@ -128,11 +129,23 @@ def __calc_date_time(self):
128129
time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
129130
time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
130131
replacement_pairs = [
131-
('1999', '%Y'), ('99', '%y'), ('22', '%H'),
132-
('44', '%M'), ('55', '%S'), ('76', '%j'),
133-
('17', '%d'), ('03', '%m'), ('3', '%m'),
134-
# '3' needed for when no leading zero.
135-
('2', '%w'), ('10', '%I')]
132+
('1999', '%Y'), ('99', '%y'), ('22', '%H'),
133+
('44', '%M'), ('55', '%S'), ('76', '%j'),
134+
('17', '%d'), ('03', '%m'), ('3', '%m'),
135+
# '3' needed for when no leading zero.
136+
('2', '%w'), ('10', '%I'),
137+
# Non-ASCII digits
138+
('\u0661\u0669\u0669\u0669', '%Y'),
139+
('\u0669\u0669', '%Oy'),
140+
('\u0662\u0662', '%OH'),
141+
('\u0664\u0664', '%OM'),
142+
('\u0665\u0665', '%OS'),
143+
('\u0661\u0667', '%Od'),
144+
('\u0660\u0663', '%Om'),
145+
('\u0663', '%Om'),
146+
('\u0662', '%Ow'),
147+
('\u0661\u0660', '%OI'),
148+
]
136149
date_time = []
137150
for directive in ('%c', '%x', '%X'):
138151
current_format = time.strftime(directive, time_tuple).lower()
@@ -157,6 +170,10 @@ def __calc_date_time(self):
157170
for tz in tz_values:
158171
if tz:
159172
current_format = current_format.replace(tz, "%Z")
173+
# Transform all non-ASCII digits to digits in range U+0660 to U+0669.
174+
current_format = re_sub(r'\d(?<![0-9])',
175+
lambda m: chr(0x0660 + int(m[0])),
176+
current_format)
160177
for old, new in replacement_pairs:
161178
current_format = current_format.replace(old, new)
162179
# If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
@@ -266,7 +283,7 @@ def __init__(self, locale_time=None):
266283
else:
267284
self.locale_time = LocaleTime()
268285
base = super()
269-
base.__init__({
286+
mapping = {
270287
# The " [1-9]" part of the regex is to make %c from ANSI C work
271288
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
272289
'f': r"(?P<f>[0-9]{1,6})",
@@ -295,11 +312,15 @@ def __init__(self, locale_time=None):
295312
'Z': self.__seqToRE((tz for tz_names in self.locale_time.timezone
296313
for tz in tz_names),
297314
'Z'),
298-
'%': '%'})
299-
base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
300-
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
301-
base.__setitem__('x', self.pattern(self.locale_time.LC_date))
315+
'%': '%'}
316+
for d in 'dmyHIMS':
317+
mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
318+
mapping['Ow'] = r'(?P<w>\d)'
319+
mapping['W'] = mapping['U'].replace('U', 'W')
320+
base.__init__(mapping)
302321
base.__setitem__('X', self.pattern(self.locale_time.LC_time))
322+
base.__setitem__('x', self.pattern(self.locale_time.LC_date))
323+
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
303324

304325
def __seqToRE(self, to_convert, directive):
305326
"""Convert a list to a regex string for matching a directive.
@@ -327,21 +348,16 @@ def pattern(self, format):
327348
regex syntax are escaped.
328349
329350
"""
330-
processed_format = ''
331351
# The sub() call escapes all characters that might be misconstrued
332352
# as regex syntax. Cannot use re.escape since we have to deal with
333353
# format directives (%m, etc.).
334-
regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])")
335-
format = regex_chars.sub(r"\\\1", format)
336-
whitespace_replacement = re_compile(r'\s+')
337-
format = whitespace_replacement.sub(r'\\s+', format)
338-
while '%' in format:
339-
directive_index = format.index('%')+1
340-
processed_format = "%s%s%s" % (processed_format,
341-
format[:directive_index-1],
342-
self[format[directive_index]])
343-
format = format[directive_index+1:]
344-
return "%s%s" % (processed_format, format)
354+
format = re_sub(r"([\\.^$*+?\(\){}\[\]|])", r"\\\1", format)
355+
format = re_sub(r'\s+', r'\\s+', format)
356+
format = re_sub(r"'", "['\u02bc]", format) # needed for br_FR
357+
def repl(m):
358+
return self[m[1]]
359+
format = re_sub(r'%(O?.)', repl, format)
360+
return format
345361

346362
def compile(self, format):
347363
"""Return a compiled re object for the format string."""
@@ -415,8 +431,8 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
415431
_regex_cache[format] = format_regex
416432
found = format_regex.match(data_string)
417433
if not found:
418-
raise ValueError("time data %r does not match format %r :: /%s/" %
419-
(data_string, format, format_regex.pattern))
434+
raise ValueError("time data %r does not match format %r" %
435+
(data_string, format))
420436
if len(data_string) != found.end():
421437
raise ValueError("unconverted data remains: %s" %
422438
data_string[found.end():])

Lib/test/test_strptime.py

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ def test_strptime_exception_context(self):
290290
# additional check for IndexError branch (issue #19545)
291291
with self.assertRaises(ValueError) as e:
292292
_strptime._strptime_time('19', '%Y %')
293-
self.assertIs(e.exception.__suppress_context__, True)
293+
self.assertIsNone(e.exception.__context__)
294294

295295
def test_unconverteddata(self):
296296
# Check ValueError is raised when there is unconverted data
@@ -483,12 +483,14 @@ def test_bad_timezone(self):
483483
# id_ID, ms_MY.
484484
# * Year is not included: ha_NG.
485485
# * Use non-Gregorian calendar: lo_LA, thai, th_TH.
486+
# On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
486487
#
487488
# BUG: Generates regexp that does not match the current date and time
488-
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
489+
# for lzh_TW.
489490
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
490491
'he_IL', 'eu_ES', 'ar_AE', 'mfe_MU', 'yo_NG',
491-
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
492+
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN',
493+
'my_MM', 'or_IN', 'shn_MM', 'az_IR')
492494
def test_date_time_locale(self):
493495
# Test %c directive
494496
loc = locale.getlocale(locale.LC_TIME)[0]
@@ -510,20 +512,23 @@ def test_date_time_locale(self):
510512
self.roundtrip('%c', slice(0, 6), time.localtime(now - 366*24*3600))
511513

512514
# NB: Dates before 1969 do not roundtrip on some locales:
513-
# bo_CN, bo_IN, dz_BT, eu_ES, eu_FR.
515+
# az_IR, bo_CN, bo_IN, dz_BT, eu_ES, eu_FR, fa_IR, or_IN.
514516
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
515517
'he_IL', 'ar_AE', 'mfe_MU', 'yo_NG',
516-
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
518+
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN',
519+
'my_MM', 'shn_MM')
517520
def test_date_time_locale2(self):
518521
# Test %c directive
519522
self.roundtrip('%c', slice(0, 6), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
523+
self.roundtrip('%c', slice(0, 6), (1800, 1, 1, 0, 0, 0, 0, 1, 0))
520524

521525
# NB: Does not roundtrip because they use a non-Gregorian calendar:
522-
# lo_LA, thai, th_TH.
526+
# lo_LA, thai, th_TH. On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
523527
# BUG: Generates regexp that does not match the current date
524-
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
528+
# for lzh_TW.
525529
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
526-
'he_IL', 'eu_ES', 'ar_AE')
530+
'he_IL', 'eu_ES', 'ar_AE',
531+
'az_IR', 'my_MM', 'or_IN', 'shn_MM')
527532
def test_date_locale(self):
528533
# Test %x directive
529534
now = time.time()
@@ -543,30 +548,39 @@ def test_date_locale(self):
543548
"musl libc issue on Emscripten, bpo-46390"
544549
)
545550
@run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
546-
'eu_ES', 'ar_AE')
551+
'eu_ES', 'ar_AE', 'my_MM', 'shn_MM')
547552
def test_date_locale2(self):
548553
# Test %x directive
549554
self.roundtrip('%x', slice(0, 3), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
555+
self.roundtrip('%x', slice(0, 3), (1800, 1, 1, 0, 0, 0, 0, 1, 0))
550556

551557
# NB: Does not roundtrip in some locales due to the ambiguity of
552558
# the time representation (bugs in locales?):
553559
# * Seconds are not included: bokmal, ff_SN, nb_NO, nn_NO, no_NO,
554560
# norwegian, nynorsk.
555561
# * Hours are in 12-hour notation without AM/PM indication: hy_AM,
556562
# ms_MY, sm_WS.
557-
# BUG: Generates regexp that does not match the current time for
558-
# aa_DJ, aa_ER, aa_ET, am_ET, az_IR, byn_ER, fa_IR, gez_ER, gez_ET,
559-
# lzh_TW, my_MM, om_ET, om_KE, or_IN, shn_MM, sid_ET, so_DJ, so_ET,
560-
# so_SO, ti_ER, ti_ET, tig_ER, wal_ET.
561-
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP')
563+
# BUG: Generates regexp that does not match the current time for lzh_TW.
564+
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
565+
'aa_ET', 'am_ET', 'az_IR', 'byn_ER', 'fa_IR', 'gez_ET',
566+
'my_MM', 'om_ET', 'or_IN', 'shn_MM', 'sid_ET', 'so_SO',
567+
'ti_ET', 'tig_ER', 'wal_ET')
562568
def test_time_locale(self):
563569
# Test %X directive
570+
loc = locale.getlocale(locale.LC_TIME)[0]
571+
pos = slice(3, 6)
572+
if glibc_ver and glibc_ver < (2, 29) and loc in {
573+
'aa_ET', 'am_ET', 'byn_ER', 'gez_ET', 'om_ET',
574+
'sid_ET', 'so_SO', 'ti_ET', 'tig_ER', 'wal_ET'}:
575+
# Hours are in 12-hour notation without AM/PM indication.
576+
# Ignore hours.
577+
pos = slice(4, 6)
564578
now = time.time()
565-
self.roundtrip('%X', slice(3, 6), time.localtime(now))
579+
self.roundtrip('%X', pos, time.localtime(now))
566580
# 1 hour 20 minutes 30 seconds ago
567-
self.roundtrip('%X', slice(3, 6), time.localtime(now - 4830))
581+
self.roundtrip('%X', pos, time.localtime(now - 4830))
568582
# 12 hours ago
569-
self.roundtrip('%X', slice(3, 6), time.localtime(now - 12*3600))
583+
self.roundtrip('%X', pos, time.localtime(now - 12*3600))
570584

571585
def test_percent(self):
572586
# Make sure % signs are handled properly

Lib/test/test_time.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ def test_strptime_exception_context(self):
292292
# additional check for IndexError branch (issue #19545)
293293
with self.assertRaises(ValueError) as e:
294294
time.strptime('19', '%Y %')
295-
self.assertIs(e.exception.__suppress_context__, True)
295+
self.assertIsNone(e.exception.__context__)
296296

297297
def test_asctime(self):
298298
time.asctime(time.gmtime(self.t))
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix :func:`time.strptime` for ``%c``, ``%x`` and ``%X`` formats in many
2+
locales that use non-ASCII digits, like Persian, Burmese, Odia and Shan.

0 commit comments

Comments
 (0)