dffa7b7d0b
Change-Id: Iab375e8c8aa4c4915f3c70a9ef6aede268e4619f Reviewed-on: https://gerrit.libreoffice.org/c/core/+/171138 Tested-by: Jenkins Reviewed-by: Ilmari Lauhakangas <ilmari.lauhakangas@libreoffice.org> Tested-by: Ilmari Lauhakangas <ilmari.lauhakangas@libreoffice.org>
1868 lines
62 KiB
Python
1868 lines
62 KiB
Python
# -*- coding: utf-8 -*-
|
|
#
|
|
# License: MIT (see LICENSE file provided)
|
|
# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
|
|
8
|
|
"""
|
|
**polib** allows you to manipulate, create, modify gettext files (pot, po and
|
|
mo files). You can load existing files, iterate through its entries, add,
|
|
modify entries, comments or metadata, etc. or create new po files from scratch.
|
|
|
|
**polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
|
|
:func:`~polib.mofile` convenience functions.
|
|
"""
|
|
|
|
__author__ = 'David Jean Louis <izimobil@gmail.com>'
|
|
__version__ = '1.0.8'
|
|
__all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
|
|
'default_encoding', 'escape', 'unescape', 'detect_encoding', ]
|
|
|
|
import array
|
|
import codecs
|
|
import os
|
|
import re
|
|
import struct
|
|
import sys
|
|
import textwrap
|
|
import binascii
|
|
|
|
try:
    # io is available from python 2.6 onwards
    import io
except ImportError:
    # replacement of io.open() for python < 2.6
    # we use codecs instead
    class io(object):
        # minimal stand-in exposing the single entry point this module
        # uses, ``io.open()``
        @staticmethod
        def open(fpath, mode='r', encoding=None):
            # codecs.open() accepts the same (path, mode, encoding)
            # arguments this module needs
            return codecs.open(fpath, mode, encoding)


# the default encoding to use when encoding cannot be detected
default_encoding = 'utf-8'
|
|
|
|
# python 2/3 compatibility helpers {{{
|
|
|
|
|
|
# True when running under Python 3.x, False under Python 2.x.
PY3 = sys.version_info >= (3, 0)

if PY3:
    text_type = str

    def b(s):
        # native text -> bytes, byte-for-byte
        return s.encode("latin-1")

    def u(s):
        # text is already unicode on python 3
        return s
else:
    text_type = unicode

    def b(s):
        # a python 2 str is already a byte string
        return s

    def u(s):
        # decode escape sequences into a unicode string
        return unicode(s, "unicode_escape")
# }}}
|
|
# _pofile_or_mofile {{{
|
|
|
|
|
|
def _pofile_or_mofile(f, type, **kwargs):
    """
    Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
    honor the DRY concept: detect the encoding, pick the matching parser,
    parse ``f`` and return the resulting file instance.

    Arguments:

    ``f``
        string, a po/mo file path or the file contents.

    ``type``
        string, either ``pofile`` or ``mofile``.
    """
    # get the file encoding, auto-detecting it when the caller did not
    # supply one
    enc = kwargs.get('encoding')
    if enc is None:
        enc = detect_encoding(f, type == 'mofile')

    # parse the file with the parser matching the requested type
    # (a real conditional expression instead of the fragile pre-2.5
    # ``cond and a or b`` idiom, which silently picks ``b`` if ``a`` is falsy)
    kls = _POFileParser if type == 'pofile' else _MOFileParser
    parser = kls(
        f,
        encoding=enc,
        check_for_duplicates=kwargs.get('check_for_duplicates', False),
        klass=kwargs.get('klass')
    )
    instance = parser.parse()
    instance.wrapwidth = kwargs.get('wrapwidth', 78)
    return instance
# }}}
|
|
# _is_file {{{
|
|
|
|
|
|
def _is_file(filename_or_contents):
|
|
"""
|
|
Safely returns the value of os.path.exists(filename_or_contents).
|
|
|
|
Arguments:
|
|
|
|
``filename_or_contents``
|
|
either a filename, or a string holding the contents of some file.
|
|
In the latter case, this function will always return False.
|
|
"""
|
|
try:
|
|
return os.path.exists(filename_or_contents)
|
|
except (ValueError, UnicodeEncodeError):
|
|
return False
|
|
# }}}
|
|
# function pofile() {{{
|
|
|
|
|
|
def pofile(pofile, **kwargs):
    """
    Convenience function: parse the po or pot file ``pofile`` and return a
    :class:`~polib.POFile` instance.

    Arguments:

    ``pofile``
        string, full or relative path to the po/pot file, or the file
        contents themselves (data).

    ``wrapwidth``
        integer, the wrap width; only useful when the ``-w`` option was
        passed to xgettext (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8"); when ``None`` (the
        default) the encoding is auto-detected.

    ``check_for_duplicates``
        boolean, whether to check for duplicate entries when adding entries
        to the file (optional, default: ``False``).

    ``klass``
        class used to instantiate the return value (optional; defaults to
        ``None``, which yields a :class:`~polib.POFile` instance).
    """
    # all the heavy lifting is shared with mofile()
    return _pofile_or_mofile(pofile, 'pofile', **kwargs)
# }}}
|
|
# function mofile() {{{
|
|
|
|
|
|
def mofile(mofile, **kwargs):
    """
    Convenience function: parse the mo file ``mofile`` and return a
    :class:`~polib.MOFile` instance.

    Arguments:

    ``mofile``
        string, full or relative path to the mo file, or the file contents
        themselves (data).

    ``wrapwidth``
        integer, the wrap width; only useful when the ``-w`` option was
        passed to xgettext to generate the po file that was used to format
        the mo file (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8"); when ``None`` (the
        default) the encoding is auto-detected.

    ``check_for_duplicates``
        boolean, whether to check for duplicate entries when adding entries
        to the file (optional, default: ``False``).

    ``klass``
        class used to instantiate the return value (optional; defaults to
        ``None``, which yields a :class:`~polib.MOFile` instance).
    """
    # all the heavy lifting is shared with pofile()
    return _pofile_or_mofile(mofile, 'mofile', **kwargs)
# }}}
|
|
# function detect_encoding() {{{
|
|
|
|
|
|
def detect_encoding(file, binary_mode=False):
    """
    Try to detect the encoding used by the ``file``. The ``file`` argument can
    be a PO or MO file path or a string containing the contents of the file.
    If the encoding cannot be detected, the function will return the value of
    ``default_encoding``.

    Arguments:

    ``file``
        string, full or relative path to the po/mo file or its content.

    ``binary_mode``
        boolean, set this to True if ``file`` is a mo file.
    """
    # charset as declared in the PO header, e.g.
    # "Content-Type: text/plain; charset=UTF-8\n"
    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
    rxt = re.compile(u(PATTERN))
    rxb = re.compile(b(PATTERN))

    def charset_exists(charset):
        """Check whether ``charset`` is valid or not."""
        try:
            codecs.lookup(charset)
        except LookupError:
            return False
        return True

    if not _is_file(file):
        # ``file`` holds the file contents: search them directly
        match = rxt.search(file)
        if match:
            enc = match.group(1).strip()
            if charset_exists(enc):
                return enc
    else:
        # For PY3, always treat as binary
        if binary_mode or PY3:
            mode = 'rb'
            rx = rxb
        else:
            mode = 'r'
            rx = rxt
        # iterate the file lazily under a context manager: the original
        # code slurped the whole file with readlines() and leaked the file
        # handle if an I/O/decoding error was raised while scanning
        with open(file, mode) as f:
            for line in f:
                match = rx.search(line)
                if match:
                    enc = match.group(1).strip()
                    if not isinstance(enc, text_type):
                        enc = enc.decode('utf-8')
                    if charset_exists(enc):
                        return enc
    # no (valid) charset declaration found
    return default_encoding
# }}}
|
|
# function escape() {{{
|
|
|
|
|
|
def escape(st):
    """
    Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
    the given string ``st`` and returns it.
    """
    # the backslash must come first so escapes introduced by the other
    # substitutions are not escaped a second time
    replacements = (
        ('\\', r'\\'),
        ('\t', r'\t'),
        ('\r', r'\r'),
        ('\n', r'\n'),
        ('\"', r'\"'),
    )
    for raw, escaped in replacements:
        st = st.replace(raw, escaped)
    return st
# }}}
|
|
# function unescape() {{{
|
|
|
|
|
|
def unescape(st):
    """
    Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
    the given string ``st`` and returns it.
    """
    # escape character -> unescaped value; an escaped double quote is not
    # in the table and simply maps to itself via the .get() default
    mapping = {'n': '\n', 't': '\t', 'r': '\r', '\\': '\\'}

    def _replace(match):
        char = match.group(1)
        return mapping.get(char, char)

    return re.sub(r'\\(\\|n|t|r|")', _replace, st)
# }}}
|
|
# function natural_sort() {{{
|
|
|
|
|
|
def natural_sort(lst):
    """
    Return the items of ``lst`` sorted in natural (human) order, so that
    e.g. ``'entry2'`` sorts before ``'entry10'``.

    Credits: http://stackoverflow.com/a/4836734
    """
    # proper defs instead of lambdas bound to names (PEP 8 E731)
    def convert(text):
        # numeric runs compare as integers, everything else
        # case-insensitively
        return int(text) if text.isdigit() else text.lower()

    def alphanum_key(key):
        # split into alternating text/number chunks, e.g.
        # 'a10b' -> ['a', 10, 'b']
        return [convert(c) for c in re.split('([0-9]+)', key)]

    return sorted(lst, key=alphanum_key)
# }}}
|
|
# class _BaseFile {{{
|
|
|
|
|
|
class _BaseFile(list):
    """
    Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
    classes. This class should **not** be instantiated directly.

    The file *is* a list: the entries are the list items, while metadata,
    header and bookkeeping flags live as attributes.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``pofile``
            string, the path to the po or mo file, or its content as a string.

        ``wrapwidth``
            integer, the wrap width, only useful when the ``-w`` option was
            passed to xgettext (optional, default: ``78``).

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file, (optional, default: ``False``).
        """
        list.__init__(self)
        # the opened file handle: keep the path only when ``pofile`` is an
        # actual file on disk; contents passed as data leave fpath to the
        # explicit ``fpath`` kwarg (or None)
        pofile = kwargs.get('pofile', None)
        if pofile and _is_file(pofile):
            self.fpath = pofile
        else:
            self.fpath = kwargs.get('fpath')
        # the width at which lines should be wrapped
        self.wrapwidth = kwargs.get('wrapwidth', 78)
        # the file encoding
        self.encoding = kwargs.get('encoding', default_encoding)
        # whether to check for duplicate entries or not
        self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
        # header
        self.header = ''
        # both po and mo files have metadata
        self.metadata = {}
        self.metadata_is_fuzzy = 0

    def __unicode__(self):
        """
        Returns the unicode representation of the file.
        """
        ret = []
        # metadata pseudo-entry first, then active entries, then the
        # obsolete ones at the end of the file
        entries = [self.metadata_as_entry()] + \
            [e for e in self if not e.obsolete]
        for entry in entries:
            ret.append(entry.__unicode__(self.wrapwidth))
        for entry in self.obsolete_entries():
            ret.append(entry.__unicode__(self.wrapwidth))
        ret = u('\n').join(ret)

        assert isinstance(ret, text_type)
        #if type(ret) != text_type:
        #    return unicode(ret, self.encoding)
        return ret

    if PY3:
        def __str__(self):
            # on python 3 str() and the unicode representation coincide
            return self.__unicode__()
    else:
        def __str__(self):
            """
            Returns the string representation of the file.
            """
            return unicode(self).encode(self.encoding)

    def __contains__(self, entry):
        """
        Overridden ``list`` method to implement the membership test (in and
        not in).
        The method considers that an entry is in the file if it finds an entry
        that has the same msgid (the test is **case sensitive**) and the same
        msgctxt (or none for both entries).

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \
            is not None

    def __eq__(self, other):
        # equality is defined on the rendered representation, so two files
        # with the same content compare equal regardless of identity
        return str(self) == str(other)

    def append(self, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        # check_for_duplicates may not be defined (yet) when unpickling.
        # But if pickling, we never want to check for duplicates anyway.
        if getattr(self, 'check_for_duplicates', False) and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).append(entry)

    def insert(self, index, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Arguments:

        ``index``
            index at which the entry should be inserted.

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        if self.check_for_duplicates and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).insert(index, entry)

    def metadata_as_entry(self):
        """
        Returns the file metadata as a :class:`~polib.POFile` instance.
        """
        # the metadata entry is the entry with an empty msgid whose msgstr
        # holds the "Key: value" header lines
        e = POEntry(msgid='')
        mdata = self.ordered_metadata()
        if mdata:
            strs = []
            for name, value in mdata:
                # NOTE(review): despite the historical comment about
                # stripping whitespace, values are emitted verbatim here
                strs.append('%s: %s' % (name, value))
            e.msgstr = '\n'.join(strs) + '\n'
        if self.metadata_is_fuzzy:
            e.flags.append('fuzzy')
        return e

    def save(self, fpath=None, repr_method='__unicode__'):
        """
        Saves the po file to ``fpath``.
        If it is an existing file and no ``fpath`` is provided, then the
        existing file is rewritten with the modified data.

        Keyword arguments:

        ``fpath``
            string, full or relative path to the file.

        ``repr_method``
            string, the method to use for output (``__unicode__`` for text
            output, ``to_binary`` for mo output).

        Raises ``IOError`` when no file path is known at all.
        """
        if self.fpath is None and fpath is None:
            raise IOError('You must provide a file path to save() method')
        contents = getattr(self, repr_method)()
        if fpath is None:
            fpath = self.fpath
        if repr_method == 'to_binary':
            # binary mo output: write raw bytes
            fhandle = open(fpath, 'wb')
        else:
            fhandle = io.open(fpath, 'w', encoding=self.encoding)
            if not isinstance(contents, text_type):
                contents = contents.decode(self.encoding)
        fhandle.write(contents)
        fhandle.close()
        # set the file path if not set
        if self.fpath is None and fpath:
            self.fpath = fpath

    def find(self, st, by='msgid', include_obsolete_entries=False,
             msgctxt=False):
        """
        Find the entry which msgid (or property identified by the ``by``
        argument) matches the string ``st``.

        Keyword arguments:

        ``st``
            string, the string to search for.

        ``by``
            string, the property to use for comparison (default: ``msgid``).

        ``include_obsolete_entries``
            boolean, whether to also search in entries that are obsolete.

        ``msgctxt``
            string, allows specifying a specific message context for the
            search (``False``, the default, means "any context").

        Returns the matching entry, or ``None`` when nothing matches.
        """
        if include_obsolete_entries:
            entries = self[:]
        else:
            entries = [e for e in self if not e.obsolete]
        for e in entries:
            if getattr(e, by) == st:
                # a context of False means "don't filter on msgctxt"
                if msgctxt is not False and e.msgctxt != msgctxt:
                    continue
                return e
        return None

    def ordered_metadata(self):
        """
        Convenience method that returns an ordered version of the metadata
        dictionary. The return value is list of tuples (metadata name,
        metadata_value).
        """
        # copy the dict first
        metadata = self.metadata.copy()
        # well-known headers come first, in the order msgfmt/xgettext emit
        # them
        data_order = [
            'Project-Id-Version',
            'Report-Msgid-Bugs-To',
            'POT-Creation-Date',
            'PO-Revision-Date',
            'Last-Translator',
            'Language-Team',
            'Language',
            'MIME-Version',
            'Content-Type',
            'Content-Transfer-Encoding',
            'Plural-Forms'
        ]
        ordered_data = []
        for data in data_order:
            try:
                value = metadata.pop(data)
                ordered_data.append((data, value))
            except KeyError:
                pass
        # the rest of the metadata will be alphabetically ordered since there
        # are no specs for this AFAIK
        for data in natural_sort(metadata.keys()):
            value = metadata[data]
            ordered_data.append((data, value))
        return ordered_data

    def to_binary(self):
        """
        Return the binary representation of the file (the GNU gettext mo
        format, little-endian header + offset tables + string blobs).
        """
        offsets = []
        entries = self.translated_entries()

        # the keys are sorted in the .mo file
        # NOTE(review): this nested ``cmp`` function is dead code — the
        # actual sort below uses a key function; confirm it can be removed.
        def cmp(_self, other):
            # msgfmt compares entries with msgctxt if it exists
            self_msgid = _self.msgctxt and _self.msgctxt or _self.msgid
            other_msgid = other.msgctxt and other.msgctxt or other.msgid
            if self_msgid > other_msgid:
                return 1
            elif self_msgid < other_msgid:
                return -1
            else:
                return 0
        # add metadata entry
        entries.sort(key=lambda o: o.msgctxt or o.msgid)
        mentry = self.metadata_as_entry()
        #mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip()
        entries = [mentry] + entries
        entries_len = len(entries)
        ids, strs = b(''), b('')
        for e in entries:
            # For each string, we need size and file offset.  Each string is
            # NUL terminated; the NUL does not count into the size.
            msgid = b('')
            if e.msgctxt:
                # Contexts are stored by storing the concatenation of the
                # context, a <EOT> byte, and the original string
                msgid = self._encode(e.msgctxt + '\4')
            if e.msgid_plural:
                # plural msgstrs are concatenated in index order, separated
                # by NUL bytes
                msgstr = []
                for index in sorted(e.msgstr_plural.keys()):
                    msgstr.append(e.msgstr_plural[index])
                msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
                msgstr = self._encode('\0'.join(msgstr))
            else:
                msgid += self._encode(e.msgid)
                msgstr = self._encode(e.msgstr)
            offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
            ids += msgid + b('\0')
            strs += msgstr + b('\0')

        # The header is 7 32-bit unsigned integers.
        keystart = 7 * 4 + 16 * entries_len
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets += [l1, o1 + keystart]
            voffsets += [l2, o2 + valuestart]
        offsets = koffsets + voffsets

        output = struct.pack(
            "Iiiiiii",
            # Magic number
            MOFile.MAGIC,
            # Version
            0,
            # number of entries
            entries_len,
            # start of key index
            7 * 4,
            # start of value index
            7 * 4 + entries_len * 8,
            # size and offset of hash table, we don't use hash tables
            0, keystart

        )
        # array.tostring() was renamed tobytes() and later removed
        if sys.version_info >= (3, 2):
            output += array.array("i", offsets).tobytes()
        else:
            output += array.array("i", offsets).tostring()
        output += ids
        output += strs
        return output

    def _encode(self, mixed):
        """
        Encodes the given ``mixed`` argument with the file encoding if and
        only if it's a unicode string and returns the encoded string.
        """
        if isinstance(mixed, text_type):
            mixed = mixed.encode(self.encoding)
        return mixed
# }}}
|
|
# class POFile {{{
|
|
|
|
|
|
class POFile(_BaseFile):
    """
    Po (or Pot) file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by extension,
    the python ``list`` type.
    """

    def __unicode__(self):
        """
        Returns the unicode representation of the po file.
        """
        # render the free-form header as '#' comment lines; collect into a
        # list and join once instead of quadratic ``+=`` concatenation
        rendered = []
        for header in self.header.split('\n'):
            if not header:
                rendered.append("#\n")
            elif header[:1] in [',', ':']:
                # flag/position-style header lines keep the prefix glued
                rendered.append('#%s\n' % header)
            else:
                rendered.append('# %s\n' % header)
        ret = ''.join(rendered)

        if not isinstance(ret, text_type):
            ret = ret.decode(self.encoding)

        return ret + _BaseFile.__unicode__(self)

    def save_as_mofile(self, fpath):
        """
        Saves the binary representation of the file to given ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the mo file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Convenience method that returns the percentage of translated
        messages.
        """
        total = len([e for e in self if not e.obsolete])
        if total == 0:
            # an empty catalog counts as fully translated
            return 100
        translated = len(self.translated_entries())
        return int(translated * 100 / float(total))

    def translated_entries(self):
        """
        Convenience method that returns the list of translated entries.
        """
        return [e for e in self if e.translated()]

    def untranslated_entries(self):
        """
        Convenience method that returns the list of untranslated entries.
        """
        return [e for e in self if not e.translated() and not e.obsolete
                and 'fuzzy' not in e.flags]

    def fuzzy_entries(self):
        """
        Convenience method that returns the list of fuzzy entries.
        """
        return [e for e in self if 'fuzzy' in e.flags]

    def obsolete_entries(self):
        """
        Convenience method that returns the list of obsolete entries.
        """
        return [e for e in self if e.obsolete]

    def merge(self, refpot):
        """
        Convenience method that merges the current pofile with the pot file
        provided. It behaves exactly as the gettext msgmerge utility:

        * comments of this file will be preserved, but extracted comments and
          occurrences will be discarded;
        * any translations or comments in the file will be discarded, however,
          dot comments and file positions will be preserved;
        * the fuzzy flags are preserved.

        Keyword argument:

        ``refpot``
            object POFile, the reference catalog.
        """
        # Store entries in dict/set for faster access
        self_entries = dict((entry.msgid, entry) for entry in self)
        refpot_msgids = set(entry.msgid for entry in refpot)
        # Merge entries that are in the refpot
        for entry in refpot:
            e = self_entries.get(entry.msgid)
            if e is None:
                e = POEntry()
                self.append(e)
            e.merge(entry)
        # ok, now we must "obsolete" entries that are not in the refpot
        # anymore
        for entry in self:
            if entry.msgid not in refpot_msgids:
                entry.obsolete = True
# }}}
|
|
# class MOFile {{{
|
|
|
|
|
|
class MOFile(_BaseFile):
    """
    Mo file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by
    extension, the python ``list`` type.
    """
    # magic number of a mo file, in native and byte-swapped order
    MAGIC = 0x950412de
    MAGIC_SWAPPED = 0xde120495

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts all keywords arguments accepted by
        :class:`~polib._BaseFile` class.
        """
        _BaseFile.__init__(self, *args, **kwargs)
        # filled in by the mo parser
        self.magic_number = None
        self.version = 0

    def save_as_pofile(self, fpath):
        """
        Saves the mofile as a pofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath)

    def save(self, fpath=None):
        """
        Saves the mofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    # The methods below exist only to keep the same interface as POFile:
    # a mo file contains translated entries exclusively, so the other
    # categories are always empty.

    def percent_translated(self):
        """
        Convenience method to keep the same interface with POFile instances.
        """
        return 100

    def translated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        """
        return self

    def untranslated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        """
        return []

    def fuzzy_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        """
        return []

    def obsolete_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        """
        return []
# }}}
|
|
# class _BaseEntry {{{
|
|
|
|
|
|
class _BaseEntry(object):
    """
    Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
    This class should **not** be instantiated directly.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``msgid``
            string, the entry msgid.

        ``msgstr``
            string, the entry msgstr.

        ``msgid_plural``
            string, the entry msgid_plural.

        ``msgstr_plural``
            dict, the entry msgstr_plural lines (plural index -> string).

        ``msgctxt``
            string, the entry context (msgctxt).

        ``obsolete``
            bool, whether the entry is "obsolete" or not.

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).
        """
        self.msgid = kwargs.get('msgid', '')
        self.msgstr = kwargs.get('msgstr', '')
        self.msgid_plural = kwargs.get('msgid_plural', '')
        self.msgstr_plural = kwargs.get('msgstr_plural', {})
        self.msgctxt = kwargs.get('msgctxt', None)
        self.obsolete = kwargs.get('obsolete', False)
        self.encoding = kwargs.get('encoding', default_encoding)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.
        """
        # obsolete entries are commented out with the '#~ ' prefix
        if self.obsolete:
            delflag = '#~ '
        else:
            delflag = ''
        ret = []
        # write the msgctxt if any
        if self.msgctxt is not None:
            ret += self._str_field("msgctxt", delflag, "", self.msgctxt,
                                   wrapwidth)
        # write the msgid
        ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
        # write the msgid_plural if any
        if self.msgid_plural:
            ret += self._str_field("msgid_plural", delflag, "",
                                   self.msgid_plural, wrapwidth)
        if self.msgstr_plural:
            # write the msgstr_plural if any, in plural-index order
            msgstrs = self.msgstr_plural
            keys = list(msgstrs)
            keys.sort()
            for index in keys:
                msgstr = msgstrs[index]
                plural_index = '[%s]' % index
                ret += self._str_field("msgstr", delflag, plural_index, msgstr,
                                       wrapwidth)
        else:
            # otherwise write the msgstr
            ret += self._str_field("msgstr", delflag, "", self.msgstr,
                                   wrapwidth)
        ret.append('')
        usedirect = True
        # On python 2 the list may hold byte strings; try decoding them to
        # unicode before joining, falling back to a direct join.
        # BUGFIX: the original condition was ``type(ret[0] != unicode)``,
        # i.e. the type of a boolean — always truthy — so the decode branch
        # ran even for unicode items; the intended test is an isinstance
        # check on the first item.
        if not PY3 and not isinstance(ret[0], unicode):
            try:
                usedirect = False
                ret = u('\n').join(x.decode('utf-8') for x in ret)
            except Exception:
                usedirect = True
        if usedirect:
            ret = u('\n').join(ret)
        return ret

    if PY3:
        def __str__(self):
            return self.__unicode__()
    else:
        def __str__(self):
            """
            Returns the string representation of the entry.
            """
            return unicode(self).encode(self.encoding)

    def __eq__(self, other):
        # equality is defined on the rendered representation
        return str(self) == str(other)

    def _str_field(self, fieldname, delflag, plural_index, field,
                   wrapwidth=78):
        """
        Render a single po field (e.g. msgid) as a list of output lines,
        escaping special characters and wrapping long values.
        """
        lines = field.splitlines(True)
        if len(lines) > 1:
            lines = [''] + lines  # start with initial empty line
        else:
            escaped_field = escape(field)
            specialchars_count = 0
            for c in ['\\', '\n', '\r', '\t', '"']:
                specialchars_count += field.count(c)
            # comparison must take into account fieldname length + one space
            # + 2 quotes (eg. msgid "<string>")
            flength = len(fieldname) + 3
            if plural_index:
                flength += len(plural_index)
            real_wrapwidth = wrapwidth - flength + specialchars_count
            if wrapwidth > 0 and len(field) > real_wrapwidth:
                # Wrap the line but take field name into account
                lines = [''] + [unescape(item) for item in wrap(
                    escaped_field,
                    wrapwidth - 2,  # 2 for quotes ""
                    drop_whitespace=False,
                    break_long_words=False
                )]
            else:
                lines = [field]
        if fieldname.startswith('previous_'):
            # quick and dirty trick to get the real field name
            fieldname = fieldname[9:]

        ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
                                escape(lines.pop(0)))]
        for line in lines:
            ret.append('%s"%s"' % (delflag, escape(line)))
        return ret
# }}}
|
|
# class POEntry {{{
|
|
|
|
|
|
class POEntry(_BaseEntry):
    """
    Represents a po file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``comment``
            string, the entry comment (extracted comment, '#.' lines).

        ``tcomment``
            string, the entry translator comment ('# ' lines).

        ``occurrences``
            list of (fpath, lineno) tuples, the entry occurrences.

        ``flags``
            list, the entry flags (e.g. 'fuzzy').

        ``previous_msgctxt``
            string, the entry previous context.

        ``previous_msgid``
            string, the entry previous msgid.

        ``previous_msgid_plural``
            string, the entry previous msgid_plural.

        ``linenum``
            integer, the line number of the entry
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        self.comment = kwargs.get('comment', '')
        self.tcomment = kwargs.get('tcomment', '')
        self.occurrences = kwargs.get('occurrences', [])
        self.flags = kwargs.get('flags', [])
        self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
        self.previous_msgid = kwargs.get('previous_msgid', None)
        self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
        self.linenum = kwargs.get('linenum', None)

    def __unicode__(self, wrapwidth=0):
        """
        Returns the unicode representation of the entry.
        """
        ret = []
        # comments first, if any (with text wrapping as xgettext does);
        # obsolete entries drop their extracted comments
        if self.obsolete:
            comments = [('tcomment', '# ')]
        else:
            comments = [('comment', '#. '), ('tcomment', '# ')]
        for c in comments:
            val = getattr(self, c[0])
            if val:
                for comment in val.split('\n'):
                    if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth:
                        ret += wrap(
                            comment,
                            wrapwidth,
                            initial_indent=c[1],
                            subsequent_indent=c[1],
                            break_long_words=False
                        )
                    else:
                        ret.append('%s%s' % (c[1], comment))

        # occurrences (with text wrapping as xgettext does)
        if not self.obsolete and self.occurrences:
            filelist = []
            for fpath, lineno in self.occurrences:
                if lineno:
                    filelist.append('%s:%s' % (fpath, lineno))
                else:
                    filelist.append(fpath)
            filestr = ' '.join(filelist)
            if wrapwidth > 0 and len(filestr) + 3 > wrapwidth:
                # textwrap split words that contain hyphen, this is not
                # what we want for filenames, so the dirty hack is to
                # temporally replace hyphens with a char that a file cannot
                # contain, like "*"
                ret += [l.replace('*', '-') for l in wrap(
                    filestr.replace('-', '*'),
                    wrapwidth,
                    initial_indent='#: ',
                    subsequent_indent='#: ',
                    break_long_words=False
                )]
            else:
                ret.append('#: ' + filestr)

        # flags (TODO: wrapping ?)
        if self.flags:
            ret.append('#, %s' % ', '.join(self.flags))

        # previous context and previous msgid/msgid_plural ('#|' comments)
        fields = ['previous_msgctxt', 'previous_msgid',
                  'previous_msgid_plural']
        if self.obsolete:
            prefix = "#~| "
        else:
            prefix = "#| "
        for f in fields:
            val = getattr(self, f)
            if val:
                ret += self._str_field(f, prefix, "", val, wrapwidth)

        # finally the msgctxt/msgid/msgstr body from the base class
        ret.append(_BaseEntry.__unicode__(self, wrapwidth))
        ret = u('\n').join(ret)
        return ret

    def __cmp__(self, other):
        """
        Called by comparison operations if rich comparison is not defined.

        Ordering: non-obsolete before obsolete, then by occurrences, then
        by msgid_plural, finally by msgid.
        """

        # First: Obsolete test
        if self.obsolete != other.obsolete:
            if self.obsolete:
                return -1
            else:
                return 1
        # Work on a copy to protect original
        occ1 = sorted(self.occurrences[:])
        occ2 = sorted(other.occurrences[:])
        pos = 0
        for entry1 in occ1:
            try:
                entry2 = occ2[pos]
            except IndexError:
                # self has more occurrences -> sorts after
                return 1
            pos = pos + 1
            if entry1[0] != entry2[0]:
                if entry1[0] > entry2[0]:
                    return 1
                else:
                    return -1
            if entry1[1] != entry2[1]:
                if entry1[1] > entry2[1]:
                    return 1
                else:
                    return -1
        # Compare msgid_plural if set
        # NOTE(review): ``msgid_plural`` is a *string* (see
        # _BaseEntry.__init__), yet it is iterated and indexed below as if
        # it were a dict like ``msgstr_plural``; with ``pos`` being a
        # character, ``self.msgid_plural[pos]`` raises TypeError whenever
        # both entries have plural forms sharing a character. Presumably
        # this was meant to compare ``msgstr_plural`` — confirm before
        # relying on the ordering of plural entries.
        if self.msgid_plural:
            if not other.msgid_plural:
                return 1
            for pos in self.msgid_plural:
                if pos not in other.msgid_plural:
                    return 1
                if self.msgid_plural[pos] > other.msgid_plural[pos]:
                    return 1
                if self.msgid_plural[pos] < other.msgid_plural[pos]:
                    return -1
        # Finally: Compare message ID
        if self.msgid > other.msgid:
            return 1
        elif self.msgid < other.msgid:
            return -1
        return 0

    # python 3 has no __cmp__ support: the rich comparison operators are
    # all defined in terms of __cmp__
    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

    def translated(self):
        """
        Returns ``True`` if the entry has been translated or ``False``
        otherwise.
        """
        # obsolete or fuzzy entries never count as translated
        if self.obsolete or 'fuzzy' in self.flags:
            return False
        if self.msgstr != '':
            return True
        if self.msgstr_plural:
            # every plural form must be filled in
            for pos in self.msgstr_plural:
                if self.msgstr_plural[pos] == '':
                    return False
            return True
        return False

    def merge(self, other):
        """
        Merge the current entry with the given pot entry: take identity,
        positions, comments and flags from ``other`` while keeping any
        existing fuzzy flag and existing plural translations.
        """
        self.msgid = other.msgid
        self.msgctxt = other.msgctxt
        self.occurrences = other.occurrences
        self.comment = other.comment
        # preserve a pre-existing fuzzy flag across the flags replacement
        fuzzy = 'fuzzy' in self.flags
        self.flags = other.flags[:]  # clone flags
        if fuzzy:
            self.flags.append('fuzzy')
        self.msgid_plural = other.msgid_plural
        self.obsolete = other.obsolete
        self.previous_msgctxt = other.previous_msgctxt
        self.previous_msgid = other.previous_msgid
        self.previous_msgid_plural = other.previous_msgid_plural
        if other.msgstr_plural:
            for pos in other.msgstr_plural:
                try:
                    # keep existing translation at pos if any
                    self.msgstr_plural[pos]
                except KeyError:
                    self.msgstr_plural[pos] = ''

    def __hash__(self):
        return hash((self.msgid, self.msgstr))
# }}}
|
|
# class MOEntry {{{
|
|
|
|
|
|
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """
    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments,
        for consistency with :class:`~polib.POEntry`:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: even though these keyword arguments are accepted,
        they hold no real meaning in the context of MO files
        and are simply ignored.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # MO files carry none of the PO-only metadata, so reset every such
        # attribute to a neutral value regardless of what the caller passed.
        for attr in ('comment', 'tcomment'):
            setattr(self, attr, '')
        for attr in ('occurrences', 'flags'):
            setattr(self, attr, [])
        for attr in ('previous_msgctxt', 'previous_msgid',
                     'previous_msgid_plural'):
            setattr(self, attr, None)

    def __hash__(self):
        """Entries hash on their (msgid, msgstr) pair."""
        key = (self.msgid, self.msgstr)
        return hash(key)
|
|
|
|
# }}}
|
|
# class _POFileParser {{{
|
|
|
|
|
|
class _POFileParser(object):
    """
    A finite state machine to parse efficiently and correctly po
    file format.

    Each input line is reduced to a two-letter symbol (see the legend in
    ``__init__``); ``process()`` looks up the (symbol, current state) pair
    in the transition table and runs the matching ``handle_*`` method.
    """

    def __init__(self, pofile, *args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).
        """
        enc = kwargs.get('encoding', default_encoding)
        if _is_file(pofile):
            try:
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
            except LookupError:
                # the requested codec is unknown: retry with the default
                enc = default_encoding
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
        else:
            # pofile is the file content itself; iterate over its lines
            self.fhandle = pofile.splitlines()

        klass = kwargs.get('klass')
        if klass is None:
            klass = POFile
        self.instance = klass(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )
        self.transitions = {}
        self.current_line = 0
        self.current_entry = POEntry(linenum=self.current_line)
        self.current_state = 'st'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Signification of symbols:
        # * ST: Beginning of the file (start)
        # * HE: Header
        # * TC: a translation comment
        # * GC: a generated comment
        # * OC: a file/line occurrence
        # * FL: a flags line
        # * CT: a message context
        # * PC: a previous msgctxt
        # * PM: a previous msgid
        # * PP: a previous msgid_plural
        # * MI: a msgid
        # * MP: a msgid plural
        # * MS: a msgstr
        # * MX: a msgstr plural
        # * MC: a msgid or msgstr continuation line
        all = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp', 'tc',
               'ms', 'mp', 'mx', 'mi']

        # a translator comment right at the start of the file belongs to
        # the header; anywhere else it starts/extends a regular entry
        self.add('tc', ['st', 'he'], 'he')
        self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms',
                        'mp', 'mx', 'mi'], 'tc')
        self.add('gc', all, 'gc')
        self.add('oc', all, 'oc')
        self.add('fl', all, 'fl')
        self.add('pc', all, 'pc')
        self.add('pm', all, 'pm')
        self.add('pp', all, 'pp')
        self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm',
                        'pp', 'ms', 'mx'], 'ct')
        self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc',
                        'pm', 'pp', 'ms', 'mx'], 'mi')
        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
        self.add('ms', ['mi', 'mp', 'tc'], 'ms')
        self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')

    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the populated ``POFile`` (or custom ``klass``) instance.
        Raises ``IOError`` on syntax errors.
        """

        # maps a leading keyword to the symbol fed into the state machine
        keywords = {
            'msgctxt': 'ct',
            'msgid': 'mi',
            'msgstr': 'ms',
            'msgid_plural': 'mp',
        }
        # same, for keywords appearing after a "#|" (previous msgid) marker
        prev_keywords = {
            'msgid_plural': 'pp',
            'msgid': 'pm',
            'msgctxt': 'pc',
        }
        tokens = []
        for line in self.fhandle:
            self.current_line += 1
            line = line.strip()
            if line == '':
                continue

            tokens = line.split(None, 2)
            nb_tokens = len(tokens)

            # "#~|" lines (previous strings of an obsolete entry) are skipped
            if tokens[0] == '#~|':
                continue

            # "#~" marks an obsolete entry: strip the marker and remember
            # the obsolete flag for handle_mi()
            if tokens[0] == '#~' and nb_tokens > 1:
                line = line[3:].strip()
                tokens = tokens[1:]
                nb_tokens -= 1
                self.entry_obsolete = 1
            else:
                self.entry_obsolete = 0

            # Take care of keywords like
            # msgid, msgid_plural, msgctxt & msgstr.
            if tokens[0] in keywords and nb_tokens > 1:
                line = line[len(tokens[0]):].lstrip()
                # reject an unescaped quote inside the quoted value
                if re.search(r'([^\\]|^)"', line[1:-1]):
                    raise IOError('Syntax error in po file %s (line %s): '
                                  'unescaped double quote found' %
                                  (self.instance.fpath, self.current_line))
                self.current_token = line
                self.process(keywords[tokens[0]])
                continue

            self.current_token = line

            if tokens[0] == '#:':
                if nb_tokens <= 1:
                    continue
                # we are on an occurrences line
                self.process('oc')

            elif line[:1] == '"':
                # we are on a continuation line
                if re.search(r'([^\\]|^)"', line[1:-1]):
                    raise IOError('Syntax error in po file %s (line %s): '
                                  'unescaped double quote found' %
                                  (self.instance.fpath, self.current_line))
                self.process('mc')

            elif line[:7] == 'msgstr[':
                # we are on a msgstr plural
                self.process('mx')

            elif tokens[0] == '#,':
                if nb_tokens <= 1:
                    continue
                # we are on a flags line
                self.process('fl')

            elif tokens[0] == '#' or tokens[0].startswith('##'):
                if line == '#':
                    line += ' '
                # we are on a translator comment line
                self.process('tc')

            elif tokens[0] == '#.':
                if nb_tokens <= 1:
                    continue
                # we are on a generated comment line
                self.process('gc')

            elif tokens[0] == '#|':
                if nb_tokens <= 1:
                    raise IOError('Syntax error in po file %s (line %s)' %
                                  (self.instance.fpath, self.current_line))

                # Remove the marker and any whitespace right after that.
                line = line[2:].lstrip()
                self.current_token = line

                if tokens[1].startswith('"'):
                    # Continuation of previous metadata.
                    self.process('mc')
                    continue

                if nb_tokens == 2:
                    # Invalid continuation line.
                    raise IOError('Syntax error in po file %s (line %s): '
                                  'invalid continuation line' %
                                  (self.instance.fpath, self.current_line))

                # we are on a "previous translation" comment line,
                if tokens[1] not in prev_keywords:
                    # Unknown keyword in previous translation comment.
                    raise IOError('Syntax error in po file %s (line %s): '
                                  'unknown keyword %s' %
                                  (self.instance.fpath, self.current_line,
                                   tokens[1]))

                # Remove the keyword and any whitespace
                # between it and the starting quote.
                line = line[len(tokens[1]):].lstrip()
                self.current_token = line
                self.process(prev_keywords[tokens[1]])

            else:
                raise IOError('Syntax error in po file %s (line %s)' %
                              (self.instance.fpath, self.current_line))

        if self.current_entry and len(tokens) > 0 and \
           not tokens[0].startswith('#'):
            # since entries are added when another entry is found, we must add
            # the last entry here (only if there are lines). Trailing comments
            # are ignored
            self.instance.append(self.current_entry)

        # before returning the instance, check if there's metadata and if
        # so extract it in a dict
        metadataentry = self.instance.find('')
        if metadataentry:  # metadata found
            # remove the entry
            self.instance.remove(metadataentry)
            self.instance.metadata_is_fuzzy = metadataentry.flags
            key = None
            for msg in metadataentry.msgstr.splitlines():
                try:
                    key, val = msg.split(':', 1)
                    self.instance.metadata[key] = val.strip()
                except (ValueError, KeyError):
                    # line without ':' continues the previous metadata value
                    if key is not None:
                        self.instance.metadata[key] += '\n' + msg.strip()
        # close opened file
        if not isinstance(self.fhandle, list):  # must be file
            self.fhandle.close()
        return self.instance

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action.
        """
        for state in states:
            # the action is always the handler named after the target state
            action = getattr(self, 'handle_%s' % next_state)
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``linenum``
            integer, the current line number of the parsed file.
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            # the handler decides (via its return value) whether the
            # machine actually moves to the new state
            if action():
                self.current_state = state
        except Exception:
            # NOTE: any failure (missing transition, handler error) is
            # reported uniformly as a syntax error; the original exception
            # is not chained
            raise IOError('Syntax error in po file (line %s)' %
                          self.current_line)

    # state handlers

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        self.instance.header += self.current_token[2:]
        return 1

    def handle_tc(self):
        """Handle a translator comment."""
        # a comment after a msgstr means the previous entry is complete:
        # flush it and start a fresh one (same pattern in handlers below)
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        tcomment = self.current_token.lstrip('#')
        # drop the single space conventionally following "#"
        if tcomment.startswith(' '):
            tcomment = tcomment[1:]
        self.current_entry.tcomment += tcomment
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """Handle a file:num occurrence."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        occurrences = self.current_token[3:].split()
        for occurrence in occurrences:
            if occurrence != '':
                try:
                    fil, line = occurrence.rsplit(':', 1)
                    # non-numeric suffix: treat the whole token as the path
                    if not line.isdigit():
                        fil = fil + line
                        line = ''
                    self.current_entry.occurrences.append((fil, line))
                except (ValueError, AttributeError):
                    self.current_entry.occurrences.append((occurrence, ''))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        self.current_entry.flags += [c.strip() for c in
                                     self.current_token[3:].split(',')]
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        # token is a quoted string; strip the surrounding quotes
        self.current_entry.previous_msgid_plural = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        self.current_entry.previous_msgid = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        self.current_entry.previous_msgctxt = \
            unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        # entry_obsolete was set while tokenizing the "#~" marker
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """Handle a msgstr plural."""
        # token looks like: msgstr[N] "..." — N is the char at position 7,
        # so only single-digit plural indexes are supported here
        index = self.current_token[7]
        value = self.current_token[self.current_token.find('"') + 1:-1]
        self.current_entry.msgstr_plural[int(index)] = unescape(value)
        # remembered so continuation lines extend the right plural slot
        self.msgstr_index = int(index)
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        token = unescape(self.current_token[1:-1])
        # append to whichever field the previous state was filling
        if self.current_state == 'ct':
            self.current_entry.msgctxt += token
        elif self.current_state == 'mi':
            self.current_entry.msgid += token
        elif self.current_state == 'mp':
            self.current_entry.msgid_plural += token
        elif self.current_state == 'ms':
            self.current_entry.msgstr += token
        elif self.current_state == 'mx':
            self.current_entry.msgstr_plural[self.msgstr_index] += token
        elif self.current_state == 'pp':
            self.current_entry.previous_msgid_plural += token
        elif self.current_state == 'pm':
            self.current_entry.previous_msgid += token
        elif self.current_state == 'pc':
            self.current_entry.previous_msgctxt += token
        # don't change the current state
        return False
|
|
# }}}
|
|
# class _MOFileParser {{{
|
|
|
|
|
|
class _MOFileParser(object):
    """
    A class to parse binary mo files.
    """

    def __init__(self, mofile, *args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).
        """
        # NOTE(review): despite the docstring, mofile is always opened as a
        # path here — raw content is not supported; confirm against callers.
        self.fhandle = open(mofile, 'rb')

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # NOTE(review): if open() raised in __init__, self.fhandle was never
        # assigned and this raises AttributeError during finalization.
        if self.fhandle:
            self.fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Raises ``IOError`` on a bad magic number or an unexpected major
        revision number.
        """
        # parse magic number
        magic_number = self._readbinary('<I', 4)
        if magic_number == MOFile.MAGIC:
            ii = '<II'
        elif magic_number == MOFile.MAGIC_SWAPPED:
            # file was produced on a machine with the opposite endianness
            ii = '>II'
        else:
            raise IOError('Invalid mo file, magic number is incorrect !')
        self.instance.magic_number = magic_number
        # parse the version number and the number of strings
        version, numofstrings = self._readbinary(ii, 8)
        # from MO file format specs: "A program seeing an unexpected major
        # revision number should stop reading the MO file entirely"
        if version not in (0, 1):
            raise IOError('Invalid mo file, unexpected major revision number')
        self.instance.version = version
        # original strings and translation strings hash table offset
        # (these are the offsets of the two (length, offset) index tables)
        msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
        # move to msgid hash table and read length and offset of msgids
        self.fhandle.seek(msgids_hash_offset)
        msgids_index = []
        for i in range(numofstrings):
            msgids_index.append(self._readbinary(ii, 8))
        # move to msgstr hash table and read length and offset of msgstrs
        self.fhandle.seek(msgstrs_hash_offset)
        msgstrs_index = []
        for i in range(numofstrings):
            msgstrs_index.append(self._readbinary(ii, 8))
        # build entries
        encoding = self.instance.encoding
        for i in range(numofstrings):
            self.fhandle.seek(msgids_index[i][1])
            msgid = self.fhandle.read(msgids_index[i][0])

            self.fhandle.seek(msgstrs_index[i][1])
            msgstr = self.fhandle.read(msgstrs_index[i][0])
            if i == 0 and not msgid:  # metadata
                # the empty msgid's msgstr holds "Key: value" header lines
                raw_metadata, metadata = msgstr.split(b('\n')), {}
                for line in raw_metadata:
                    tokens = line.split(b(':'), 1)
                    if tokens[0] != b(''):
                        try:
                            k = tokens[0].decode(encoding)
                            v = tokens[1].decode(encoding)
                            metadata[k] = v.strip()
                        except IndexError:
                            # line without ':' — keep the key, empty value
                            metadata[k] = u('')
                self.instance.metadata = metadata
                continue
            # test if we have a plural entry
            # (plural forms are NUL-joined inside a single msgid/msgstr)
            msgid_tokens = msgid.split(b('\0'))
            if len(msgid_tokens) > 1:
                entry = self._build_entry(
                    msgid=msgid_tokens[0],
                    msgid_plural=msgid_tokens[1],
                    msgstr_plural=dict((k, v) for k, v in
                                       enumerate(msgstr.split(b('\0'))))
                )
            else:
                entry = self._build_entry(msgid=msgid, msgstr=msgstr)
            self.instance.append(entry)
        # close opened file
        self.fhandle.close()
        return self.instance

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
                     msgstr_plural=None):
        # Build a MOEntry from raw byte strings, decoding with the file's
        # encoding.  A msgctxt, when present, is joined to the msgid with
        # an EOT (0x04) byte.
        msgctxt_msgid = msgid.split(b('\x04'))
        encoding = self.instance.encoding
        if len(msgctxt_msgid) > 1:
            kwargs = {
                'msgctxt': msgctxt_msgid[0].decode(encoding),
                'msgid': msgctxt_msgid[1].decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            # decode in place, then hand the dict over to the entry
            for k in msgstr_plural:
                msgstr_plural[k] = msgstr_plural[k].decode(encoding)
            kwargs['msgstr_plural'] = msgstr_plural
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpack n bytes of data using format <fmt>.
        It returns a tuple or a mixed value if the tuple length is 1.
        """
        bytes = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, bytes)
        if len(tup) == 1:
            return tup[0]
        return tup
|
|
# }}}
|
|
# class TextWrapper {{{
|
|
|
|
|
|
class TextWrapper(textwrap.TextWrapper):
    """
    Subclass of textwrap.TextWrapper that backport the
    drop_whitespace option.
    """
    def __init__(self, *args, **kwargs):
        # pull out the backported option before the parent constructor
        # sees the remaining keyword arguments
        drop_whitespace = kwargs.pop('drop_whitespace', True)
        textwrap.TextWrapper.__init__(self, *args, **kwargs)
        self.drop_whitespace = drop_whitespace

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks into lines no longer than
        'self.width' (unless 'break_long_words' is false, in which case
        some lines may exceed it).  A chunk is either a run of whitespace
        or a "word"; line breaks may only fall between chunks.  When
        'drop_whitespace' is set, whitespace chunks are removed from the
        beginning and end of each produced line.
        """
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)
        wrapped = []

        # Reverse in place so pop() consumes chunks in original order.
        chunks.reverse()

        while chunks:
            # Chunks collected for the line being built, and their total
            # length.
            line_parts = []
            line_len = 0

            # Static prefix for this line.
            prefix = self.subsequent_indent if wrapped else \
                self.initial_indent

            # Room left for actual text on this line.
            avail = self.width - len(prefix)

            # Drop leading whitespace, except at the very start of the
            # text (i.e. before any line has been produced).
            if self.drop_whitespace and wrapped and \
                    chunks[-1].strip() == '':
                del chunks[-1]

            # Greedily take chunks while they fit.
            while chunks:
                chunk_len = len(chunks[-1])
                if line_len + chunk_len > avail:
                    break
                line_parts.append(chunks.pop())
                line_len += chunk_len

            # The next chunk would not fit on any line: let the parent
            # class break it up.
            if chunks and len(chunks[-1]) > avail:
                self._handle_long_word(chunks, line_parts, line_len, avail)

            # Drop a trailing all-whitespace chunk.
            if self.drop_whitespace and line_parts and \
                    not line_parts[-1].strip():
                del line_parts[-1]

            # Emit the assembled line (skip lines that ended up empty).
            if line_parts:
                wrapped.append(prefix + ''.join(line_parts))

        return wrapped
|
|
# }}}
|
|
# function wrap() {{{
|
|
|
|
|
|
def wrap(text, width=70, **kwargs):
    """
    Wrap a single paragraph of text, returning a list of wrapped lines.
    """
    # textwrap grew the drop_whitespace option in 2.6; older interpreters
    # go through the local backport instead
    if sys.version_info >= (2, 6):
        return textwrap.wrap(text, width=width, **kwargs)
    return TextWrapper(width=width, **kwargs).wrap(text)
|
|
|
|
# }}}
|
|
|
|
def genKeyId(inkey):
    """
    Derive a short 5-character key id from *inkey*.

    The CRC32 of the UTF-8 encoded string is consumed 6 bits at a time;
    each 6-bit value selects one character from an ASCII alphabet chosen
    to avoid visually confusable characters.
    """
    crc = binascii.crc32(bytes(inkey, encoding="UTF-8")) & 0xffffffff
    # Use simple ASCII characters, exclude I, l, 1 and O, 0 to avoid
    # confusing IDs
    alphabet = "ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz23456789"
    chars = []
    for _ in range(5):
        chars.append(alphabet[(crc & 63) % len(alphabet)])
        crc >>= 6
    return "".join(chars)
|