#! /usr/bin/env python3
# -*- coding: iso-8859-1 -*-
# Originally written by Barry Warsaw <barry@python.org>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf@artcom-gmbh.de>
#
# 2002-11-22 Jürgen Hermann <jh@web.de>
# Added checks that _() only contains string literals, and
# command line args are resolved to module lists, i.e. you
# can now pass a filename, a module or package name, or a
# directory (including globbing chars, important for Win32).
# Made docstring fit in 80 chars wide displays using pydoc.
#
# for selftesting
- try:
- import fintl
- _ = fintl.gettext
- except ImportError:
- _ = lambda s: s
# NOTE: this assignment doubles as the program's --help text; usage()
# expands it with "% globals()", which fills in %(DEFAULTKEYWORDS)s below.
__doc__ = _("""pygettext -- Python equivalent of xgettext(1)
Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.
There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.
Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.
A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:
gettext("Translatable String")
_("Translatable String")
Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.
[1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
[2] http://www.gnu.org/software/gettext/gettext.html
NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.
Usage: pygettext [options] inputfile ...
Options:
-a
--extract-all
Extract all strings.
-d name
--default-domain=name
Rename the default output file from messages.pot to name.pot.
-E
--escape
Replace non-ASCII characters with octal escape sequences.
-D
--docstrings
Extract module, class, method, and function docstrings. These do
not need to be wrapped in _() markers, and in fact cannot be for
Python to consider them docstrings. (See also the -X option).
-h
--help
Print this help message and exit.
-k word
--keyword=word
Keywords to look for in addition to the default set, which are:
%(DEFAULTKEYWORDS)s
You can have multiple -k flags on the command line.
-K
--no-default-keywords
Disable the default set of keywords (see above). Any keywords
explicitly added with the -k/--keyword option are still recognized.
--no-location
Do not write filename/lineno location comments.
-n
--add-location
Write filename/lineno location comments indicating where each
extracted string is found in the source. These lines appear before
each msgid. The style of comments is controlled by the -S/--style
option. This is the default.
-o filename
--output=filename
Rename the default output file from messages.pot to filename. If
filename is `-' then the output is sent to standard out.
-p dir
--output-dir=dir
Output files will be placed in directory dir.
-S stylename
--style stylename
Specify which style to use for location comments. Two styles are
supported:
Solaris # File: filename, line: line-number
GNU #: filename:line
The style name is case insensitive. GNU style is the default.
-v
--verbose
Print the names of the files being processed.
-V
--version
Print the version of pygettext and exit.
-w columns
--width=columns
Set width of output to columns.
-x filename
--exclude-file=filename
Specify a file that contains a list of strings that are not be
extracted from the input files. Each string to be excluded must
appear on a line by itself in the file.
-X filename
--no-docstrings=filename
Specify a file that contains a list of files (one per line) that
should not have their docstrings extracted. This is only useful in
conjunction with the -D option above.
If `inputfile' is -, standard input is read.
""")
# Standard-library imports only, sorted alphabetically.
import getopt
import glob
import importlib.machinery
import importlib.util
import os
import sys
import time
import token
import tokenize
__version__ = '1.5'

# Keywords searched for by default; -k adds to this list, -K empties it.
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Used to join adjacent string-literal fragments found inside _( ... ).
EMPTYSTRING = ''
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
# %(time)s, %(charset)s, %(encoding)s and %(version)s are filled in by
# TokenEater.write().
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"
''')
def usage(code, msg=''):
    """Write the module help text (plus an optional error message) to
    stderr and terminate with the given exit code."""
    err = sys.stderr
    print(__doc__ % globals(), file=err)
    if msg:
        print(msg, file=err)
    sys.exit(code)
def make_escapes(pass_nonascii):
    """(Re)build the global character-escape table and pick the escape
    function used by normalize().

    When pass_nonascii is true, characters >= 128 are allowed to pass
    through untouched (so e.g. msgid "Höhe" is not turned into
    'msgid "H\\366he"'); otherwise every byte outside 32..126 gets an
    octal escape.
    """
    global escapes, escape
    if pass_nonascii:
        limit = 128
        escape = escape_ascii
    else:
        limit = 256
        escape = escape_nonascii
    # start with octal escapes for everything ...
    escapes = [r"\%03o" % code for code in range(limit)]
    # ... let printable ASCII map to itself ...
    for code in range(32, 127):
        escapes[code] = chr(code)
    # ... except characters that are special inside a PO string
    for special, replacement in (('\\', r'\\'), ('\t', r'\t'),
                                 ('\r', r'\r'), ('\n', r'\n'),
                                 ('\"', r'\"')):
        escapes[ord(special)] = replacement
def escape_ascii(s, encoding):
    """Escape s via the global table; non-ASCII characters pass through."""
    out = []
    for ch in s:
        code = ord(ch)
        out.append(escapes[code] if code < 128 else ch)
    return ''.join(out)
def escape_nonascii(s, encoding):
    """Encode s to bytes and map every byte through the escapes table."""
    encoded = s.encode(encoding)
    return ''.join(map(escapes.__getitem__, encoded))
def is_literal_string(s):
    """Return true if token text s looks like a plain or r/u-prefixed
    quoted string literal."""
    quotes = '\'"'
    if s[0] in quotes:
        return True
    return s[0] in 'rRuU' and s[1] in quotes
def safe_eval(s):
    """Unwrap the quotes of a string-literal token by evaluating it with
    all builtins disabled."""
    no_builtins = {'__builtins__': {}}
    return eval(s, no_builtins, {})
def normalize(s, encoding):
    """Convert a Python string into the quoted, escaped form used for
    msgid entries in a .po file (much closer to C style)."""
    pieces = s.split('\n')
    if len(pieces) == 1:
        # single line: one quoted fragment
        return '"' + escape(s, encoding) + '"'
    # Multi-line: drop a trailing empty piece (string ended in a newline),
    # re-attach that newline to the last line, then emit one quoted
    # fragment per line.
    if not pieces[-1]:
        del pieces[-1]
    pieces[-1] += '\n'
    escaped = [escape(piece, encoding) for piece in pieces]
    return '""\n"' + '\\n"\n"'.join(escaped) + '"'
def containsAny(s, chars):
    """Check whether *s* contains ANY of the characters in *chars*.

    Uses any() so the scan short-circuits on the first hit instead of
    materializing a full membership list; the parameters were renamed so
    they no longer shadow the str/set builtins.  Returns a bool, exactly
    as the old `1 in [...]` expression did.
    """
    return any(c in s for c in chars)
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory (globbing characters are honoured in filenames).

    Returns an empty list when the name cannot be resolved to any existing
    Python source file.
    """
    if not os.path.exists(name):
        # check for glob chars (inlined check, so this function has no
        # helper dependency); expand and recurse on every match
        if any(ch in name for ch in "*?[]"):
            result = []
            for match in glob.glob(name):
                result.extend(getFilesForName(match))
            return result
        # try to resolve a module or package name to its source file.
        # find_spec() returns None for an unknown top-level name (the old
        # code then crashed with AttributeError on spec.origin) and may
        # raise for malformed names.
        try:
            spec = importlib.util.find_spec(name)
            name = spec.origin if spec is not None else None
        except (ImportError, ValueError):
            name = None
        if not name:
            return []
    if os.path.isdir(name):
        # find all python source files beneath the directory
        found = []
        # extension for python source files
        _py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        for root, dirs, files in os.walk(name):
            # don't recurse into CVS directories
            if 'CVS' in dirs:
                dirs.remove('CVS')
            # add all *.py files to the result
            found.extend(
                os.path.join(root, fname) for fname in files
                if os.path.splitext(fname)[1] == _py_ext
            )
        return found
    elif os.path.exists(name):
        # a single file
        return [name]
    return []
class TokenEater:
    """Consume a Python token stream and collect translatable strings.

    Instances are fed one token at a time through __call__ and run a
    small state machine: self.__state always points at the handler
    method for the current state.  Collected messages are stored as
    msgid -> {(filename, lineno): isdocstring} and rendered in .pot
    format by write().
    """
    def __init__(self, options):
        self.__options = options        # the command-line Options object
        self.__messages = {}            # msgid -> {(file, lineno): isdocstring}
        self.__state = self.__waiting   # current state-machine handler
        self.__data = []                # string fragments seen inside _( ... )
        self.__lineno = -1              # line number of the opening '('
        self.__freshmodule = 1          # still before the module docstring?
        self.__curfile = None           # file currently being scanned
        self.__enclosurecount = 0       # bracket nesting depth in a def/class header

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state handler; only the token type,
        # its text and its starting line number are needed
        ## import token
        ## print('ttype:', token.tok_name[ttype], 'tstring:', tstring,
        ##       file=sys.stderr)
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Default state: watch for docstrings and keyword calls."""
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING and is_literal_string(tstring):
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                elif ttype not in (tokenize.COMMENT, tokenize.NL):
                    # anything else but comments/blank lines means the
                    # module has no docstring
                    self.__freshmodule = 0
                return
            # class or func/method docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        """After 'class'/'def': skip the header up to its colon."""
        # skip over any enclosure pairs until we see the colon
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                # we see a colon and we're not in an enclosure: end of def
                self.__state = self.__suitedocstring
            elif tstring in '([{':
                self.__enclosurecount += 1
            elif tstring in ')]}':
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        """After a suite header: grab its docstring if one follows."""
        # ignore any intervening noise
        if ttype == tokenize.STRING and is_literal_string(tstring):
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        """After a keyword (e.g. _): require an opening parenthesis."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Inside _( ... ): collect string literals until the ')'."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING and is_literal_string(tstring):
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Record one extracted message unless it is excluded."""
        if lineno is None:
            lineno = self.__lineno
        if not msg in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start scanning a new file; re-arm module-docstring detection."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Emit all collected messages to fp in .pot format."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        encoding = fp.encoding if fp.encoding else 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = sorted(reverse.keys())
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d, file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)
def main():
    """Command-line entry point: parse options, scan every input file
    with a TokenEater, and write the resulting .pot file."""
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             'docstrings', 'no-docstrings',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    # maps the (case-insensitive) --style argument to the style constant
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            # each line of the file names one input file whose docstrings
            # must not be extracted
            fp = open(arg)
            try:
                while 1:
                    line = fp.readline()
                    if not line:
                        break
                    options.nodocstrings[line[:-1]] = 1
            finally:
                fp.close()

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            fp = open(options.excludefilename)
            options.toexclude = fp.readlines()
            fp.close()
        except IOError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists ('-' means standard input)
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            # tokenize needs a bytes stream, hence the .buffer
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e:
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
if __name__ == '__main__':
    main()
    # some more test strings (extraction candidates when pygettext is run
    # over its own source)
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')