gh-104400: pygettext: Prepare to replace TokenEater with a NodeVisitor #129672

Merged · 1 commit · Feb 4, 2025
137 changes: 69 additions & 68 deletions Tools/i18n/pygettext.py
@@ -7,15 +7,9 @@
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.
pygettext uses Python's standard tokenize module to scan Python source
code, generating .pot files identical to what GNU xgettext[2] generates
for C and C++ code. From there, the standard GNU tools can be used.
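
As a rough illustration of that scanning approach, here is a minimal sketch (not part of this diff) that uses the tokenize module to list the string literals a tool like pygettext walks over; it only prints token positions rather than building catalog entries:

import io
import tokenize

source = 'print(_("hello world"))\n'
for tok in tokenize.generate_tokens(io.StringIO(source).readline):
    # A pygettext-style extractor looks at STRING tokens that follow a
    # recognized keyword such as _ or gettext.
    if tok.type == tokenize.STRING:
        print(tok.start, tok.string)   # -> (1, 8) "hello world"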

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
@@ -41,6 +35,9 @@
option arguments is broken, and in these cases, pygettext just defines
additional switches.

NOTE: The public interface of pygettext is limited to the command-line
interface only. The internal API is subject to change without notice.

Usage: pygettext [options] inputfile ...

Options:
@@ -328,12 +325,6 @@ def add_location(self, filename, lineno, msgid_plural=None, *, is_docstring=False
self.is_docstring |= is_docstring


def key_for(msgid, msgctxt=None):
if msgctxt is not None:
return (msgctxt, msgid)
return msgid


class TokenEater:
def __init__(self, options):
self.__options = options
@@ -354,6 +345,10 @@ def __call__(self, ttype, tstring, stup, etup, line):
## file=sys.stderr)
self.__state(ttype, tstring, stup[0])

@property
def messages(self):
return self.__messages

def __waiting(self, ttype, tstring, lineno):
opts = self.__options
# Do docstring extractions, if enabled
@@ -513,7 +508,7 @@ def __addentry(self, msg, lineno=None, *, is_docstring=False):
lineno = self.__lineno
msgctxt = msg.get('msgctxt')
msgid_plural = msg.get('msgid_plural')
key = key_for(msgid, msgctxt)
key = self._key_for(msgid, msgctxt)
if key in self.__messages:
self.__messages[key].add_location(
self.__curfile,
@@ -530,6 +525,12 @@ def __addentry(self, msg, lineno=None, *, is_docstring=False):
is_docstring=is_docstring,
)

@staticmethod
def _key_for(msgid, msgctxt=None):
if msgctxt is not None:
return (msgctxt, msgid)
return msgid
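
For illustration (a sketch assuming the class as shown above), the helper keys a catalog entry by msgid alone, or by a (msgctxt, msgid) tuple when a context is present, so the same msgid can appear under different contexts:

TokenEater._key_for('Open')                    # 'Open'
TokenEater._key_for('Open', msgctxt='menu')    # ('menu', 'Open')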

def warn_unexpected_token(self, token):
print((
'*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
@@ -543,58 +544,58 @@ def set_filename(self, filename):
self.__curfile = filename
self.__freshmodule = 1

def write(self, fp):
options = self.__options
timestamp = time.strftime('%Y-%m-%d %H:%M%z')
encoding = fp.encoding if fp.encoding else 'UTF-8'
print(pot_header % {'time': timestamp, 'version': __version__,
'charset': encoding,
'encoding': '8bit'}, file=fp)

# Sort locations within each message by filename and lineno
sorted_keys = [
(key, sorted(msg.locations))
for key, msg in self.__messages.items()
]
# Sort messages by locations
# For example, a message with locations [('test.py', 1), ('test.py', 2)] will
# appear before a message with locations [('test.py', 1), ('test.py', 3)]
sorted_keys.sort(key=itemgetter(1))

for key, locations in sorted_keys:
msg = self.__messages[key]
if options.writelocations:
# location comments are different b/w Solaris and GNU:
if options.locationstyle == options.SOLARIS:
for location in locations:
print(f'# File: {location.filename}, line: {location.lineno}', file=fp)
elif options.locationstyle == options.GNU:
# fit as many locations on one line, as long as the
# resulting line length doesn't exceed 'options.width'
locline = '#:'
for location in locations:
s = f' {location.filename}:{location.lineno}'
if len(locline) + len(s) <= options.width:
locline = locline + s
else:
print(locline, file=fp)
locline = f'#:{s}'
if len(locline) > 2:

def write_pot_file(messages, options, fp):
timestamp = time.strftime('%Y-%m-%d %H:%M%z')
encoding = fp.encoding if fp.encoding else 'UTF-8'
print(pot_header % {'time': timestamp, 'version': __version__,
'charset': encoding,
'encoding': '8bit'}, file=fp)

# Sort locations within each message by filename and lineno
sorted_keys = [
(key, sorted(msg.locations))
for key, msg in messages.items()
]
# Sort messages by locations
# For example, a message with locations [('test.py', 1), ('test.py', 2)] will
# appear before a message with locations [('test.py', 1), ('test.py', 3)]
sorted_keys.sort(key=itemgetter(1))

for key, locations in sorted_keys:
msg = messages[key]
if options.writelocations:
# location comments are different b/w Solaris and GNU:
if options.locationstyle == options.SOLARIS:
for location in locations:
print(f'# File: {location.filename}, line: {location.lineno}', file=fp)
elif options.locationstyle == options.GNU:
# fit as many locations on one line, as long as the
# resulting line length doesn't exceed 'options.width'
locline = '#:'
for location in locations:
s = f' {location.filename}:{location.lineno}'
if len(locline) + len(s) <= options.width:
locline = locline + s
else:
print(locline, file=fp)
if msg.is_docstring:
# If the entry was gleaned out of a docstring, then add a
# comment stating so. This is to aid translators who may wish
# to skip translating some unimportant docstrings.
print('#, docstring', file=fp)
if msg.msgctxt is not None:
print('msgctxt', normalize(msg.msgctxt, encoding), file=fp)
print('msgid', normalize(msg.msgid, encoding), file=fp)
if msg.msgid_plural is not None:
print('msgid_plural', normalize(msg.msgid_plural, encoding), file=fp)
print('msgstr[0] ""', file=fp)
print('msgstr[1] ""\n', file=fp)
else:
print('msgstr ""\n', file=fp)
locline = f'#:{s}'
if len(locline) > 2:
print(locline, file=fp)
if msg.is_docstring:
# If the entry was gleaned out of a docstring, then add a
# comment stating so. This is to aid translators who may wish
# to skip translating some unimportant docstrings.
print('#, docstring', file=fp)
if msg.msgctxt is not None:
print('msgctxt', normalize(msg.msgctxt, encoding), file=fp)
print('msgid', normalize(msg.msgid, encoding), file=fp)
if msg.msgid_plural is not None:
print('msgid_plural', normalize(msg.msgid_plural, encoding), file=fp)
print('msgstr[0] ""', file=fp)
print('msgstr[1] ""\n', file=fp)
else:
print('msgstr ""\n', file=fp)


def main():
@@ -752,7 +753,7 @@ class Options:
fp = open(options.outfile, 'w')
closep = 1
try:
eater.write(fp)
write_pot_file(eater.messages, options, fp)
finally:
if closep:
fp.close()