#! /usr/bin/env python
#
# check_po - a gramps tool to check validity of po files
#
# Copyright (C) 2006-2006  Kees Bakker
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#

#
# TODO
#
# * Check for HTML text in msgstr when there is none in msgid
# * Check for matching HTML tag/endtag in msgstr
#

import sys
import re
import os
import io
from argparse import ArgumentParser

all_total = {}
all_fuzzy = {}
all_untranslated = {}
all_percent_s = {}
all_named_s = {}
all_bnamed_s = {}
all_context = {}
all_coverage = {}
all_template_coverage = {}


def strip_quotes(st):
    """Return `st` without its surrounding double quotes, if it has both."""
    if len(st) >= 2 and st[0] == '"' and st[-1] == '"':
        return st[1:-1]
    return st


class CheckException(Exception):
    """Raised when the PO parser meets input it cannot handle."""
    pass


class Check:
    """Base class for all checks.

    A check collects the offending messages in `self.msgs`.  Subclasses
    set `diag_header` and `summary_text` and provide `process(msg)`.
    """

    def __init__(self):
        self.msgs = []

    def diag(self):
        """Print every collected message under the check's header."""
        if self.msgs:
            # print('') gives a blank line on both Python 2 and Python 3;
            # a bare `print` is a no-op expression on Python 3.
            print('')
            print(self.diag_header)
            for m in self.msgs:
                m.diag()

    def summary(self):
        """Print a one-line count of the collected messages."""
        print("%-20s%d" % (self.summary_text, len(self.msgs)))

    def _process_forms(self, msg, check_pair):
        """Run `check_pair(msg, msgid, msgstr)` on the singular form and,
        when the message has a plural with a second msgstr, on the plural
        form as well."""
        check_pair(msg, msg.msgid, msg.msgstr[0])
        if msg.msgidp and len(msg.msgstr) >= 2:
            check_pair(msg, msg.msgidp, msg.msgstr[1])


class Check_fmt(Check):
    """Flag messages where a literal format (e.g. '%s') occurs a different
    number of times in the msgid and in the msgstr."""

    def __init__(self, fmt):
        Check.__init__(self)
        self.diag_header = "-------- %s mismatches --------------" % fmt
        self.summary_text = "%s mismatches:" % fmt
        self.fmt = fmt

    def __process(self, msg, msgid, msgstr):
        if msgid.count(self.fmt) != msgstr.count(self.fmt):
            self.msgs.append(msg)

    def process(self, msg):
        self._process_forms(msg, self.__process)


class Check_named_fmt(Check):
    """Flag messages whose %(name)x formats differ between msgid and msgstr."""

    # A pattern to find all %()
    find_named_fmt_pat = re.compile(r'% \( \w+ \) \d* \D', re.VERBOSE)

    def __init__(self):
        Check.__init__(self)
        self.diag_header = "-------- %() name mismatches --------------"
        self.summary_text = "%() name mismatches:"

    def __process(self, msg, msgid, msgstr):
        # Same named formats, ignoring order?  Lists of different length
        # can never compare equal, so this also covers a count mismatch.
        in_id = sorted(self.find_named_fmt_pat.findall(msgid))
        in_str = sorted(self.find_named_fmt_pat.findall(msgstr))
        if in_id != in_str:
            self.msgs.append(msg)

    def process(self, msg):
        self._process_forms(msg, self.__process)


class Check_mapping_fmt(Check):
    """Flag messages whose {name} formats differ between msgid and msgstr."""

    # A pattern to find all {}
    find_map_pat = re.compile(r'\{\w+\.?f?\[?.*?\]?\}',
                              re.UNICODE)  # needed for Russian index letters

    def __init__(self):
        Check.__init__(self)
        self.diag_header = "-------- {} name mismatches --------------"
        self.summary_text = "{} name mismatches:"

    def __process(self, msg, msgid, msgstr):
        in_id = sorted(self.find_map_pat.findall(msgid))
        in_str = sorted(self.find_map_pat.findall(msgstr))
        # Same number of named formats?
        if len(in_id) != len(in_str):
            self.msgs.append(msg)
            return
        # Do we have the same named formats?
        if in_id == in_str:
            return
        # A translation may add an '.f[...]' suffix to a name; drop it and
        # compare again.  Re-sort, because removing the suffix can change
        # the relative order of the entries.
        in_str = sorted(re.sub(r'\.f\[.+\]', '', fmt) for fmt in in_str)
        if in_id != in_str:
            self.msgs.append(msg)

    def process(self, msg):
        self._process_forms(msg, self.__process)


class Check_missing_sd(Check):
    """Flag translations containing a %(name) that is not followed by
    's' or 'd'."""

    # A pattern to find %() without s or d
    # Here is a command to use for testing
    # print(re.compile(r'% \( \w+ \) \d* (\D|$)', re.VERBOSE).findall( '%(event_name)s: %(place)s%(endnotes)s. ' ))
    find_named_fmt_pat2 = re.compile(r'% \( \w+ \) \d* (\D|$)', re.VERBOSE)

    def __init__(self):
        Check.__init__(self)
        self.diag_header = "-------- %() without 's' or 'd' mismatches --------------"
        self.summary_text = "%() missing s/d:"

    def process(self, msg):
        for msgstr in msg.msgstr:
            conversions = self.find_named_fmt_pat2.findall(msgstr)
            if any(conv not in ('s', 'd') for conv in conversions):
                self.msgs.append(msg)


class Check_runaway(Check):
    """Flag translations that still contain the 'context|' prefix."""

    def __init__(self):
        Check.__init__(self)
        self.diag_header = "-------- Runaway context in translation ---------"
        self.summary_text = "Runaway context:"

    def __process(self, msg, msgid, msgstr):
        # Runaway context. In the translated part we only want to see
        # the translation of the word after the |
        if (msgid.count('|') > 0
                and msgstr.count('|') > 0
                and msgid != msgstr
                and ' lexeme' not in msgid  # _datestrings.py keyword
                and 'alternative month names ' not in msgid  # this one too
                ):
            self.msgs.append(msg)

    def process(self, msg):
        self._process_forms(msg, self.__process)


class Check_xml_chars(Check):
    """Flag tips.xml translations containing unescaped XML special chars."""

    # Special XML characters
    # It is not allowed to have a quote, an ampersand or an angle bracket
    #
    # NOTE(review): the previous pattern had unbalanced parentheses in its
    # '>' alternative, so re.compile() raised re.error as soon as this class
    # was defined.  The '>' rule below is the presumed intent (accept '>'
    # only when it closes one of the allowed tags), written with fixed-width
    # lookbehinds because Python's re rejects variable-width ones.
    xml_chars_pat = re.compile(
        r'''
          < (?! (b>|/b>|i>|/i>|br/>) )                     # stray opening bracket
        | (?<! <b ) (?<! </b ) (?<! <i ) (?<! </i ) (?<! <br/ ) >  # stray closing bracket
        | "                                                # bare double quote
        | & (?! (quot|nbsp|gt|amp); )                      # bare ampersand
        ''', re.VERBOSE)

    def __init__(self):
        Check.__init__(self)
        self.diag_header = "-------- unescaped XML special characters ---------"
        self.summary_text = "XML special chars:"

    def process(self, msg):
        # XML errors
        # Only look at messages in the tips.xml
        if msg.is_tips_xml and self.xml_chars_pat.search(msg.msgstr[0]):
            self.msgs.append(msg)


class Check_last_char(Check):
    """Flag translations whose last character (white space, period, colon
    or closing parenthesis) does not match that of the msgid."""

    def __init__(self):
        Check.__init__(self)
        self.diag_header = "-------- last character not identical ---------"
        self.summary_text = "Last character:"

    def __process(self, msg, msgid, msgstr):
        id_last = msgid[-1:]
        str_last = msgstr[-1:]
        mismatch = (
            id_last.isspace() != str_last.isspace()
            or (id_last == '.') != (str_last in ('.', '\u3002'))  # Chinese
            or (id_last == ':') != (str_last in (':', '\uff1a'))  # Chinese
            or (id_last == ')') != (str_last in (')', '\uff09'))  # Chinese
        )
        if mismatch:
            self.msgs.append(msg)

    def process(self, msg):
        # Last character of msgid? White space? Period?
        if msg.is_fuzzy:
            return
        self._process_forms(msg, self.__process)


class Check_shortcut_trans(Check):
    """Flag translations that contain an underscore (shortcut key marker)
    when the msgid has none."""

    def __init__(self):
        Check.__init__(self)
        self.diag_header = "-------- shortcut key in translation ---------"
        self.summary_text = "Shortcut in msgstr:"

    def __process(self, msg, msgid, msgstr):
        if msgid.count('_') == 0 and msgstr.count('_') > 0:
            self.msgs.append(msg)

    def process(self, msg):
        self._process_forms(msg, self.__process)


class Msgid:
    """One PO entry: comments, msgctxt, msgid, msgid_plural and msgstr(s)."""

    fuzzy_pat = re.compile('fuzzy')
    tips_xml_pat = re.compile(r'tips\.xml')

    # Keyword prefixes, anchored at the start of the line so that the same
    # words inside the quoted text are left alone.
    _msgctxt_kw = re.compile(r'^\s*msgctxt\s+')
    _msgid_kw = re.compile(r'^\s*msgid\s+')
    _msgidp_kw = re.compile(r'^\s*msgid_plural\s+')
    _msgstr_kw = re.compile(r'^\s*msgstr(\[\d\])?\s+')

    def __init__(self, msgnr, lineno):
        self._msgctxt = []  # For debugging purpose the original text
        self._msgid = []    # For debugging purpose the original text
        self._msgidp = []   # For debugging purpose the original text
        self._msgstr = []   # For debugging purpose the original text
        self.msgctxt = ''
        self.msgid = ''
        self.msgidp = ''
        self.msgstr = []    # This is a list to support plural
        self._cmnt = []
        self.nr = msgnr
        self.lineno = lineno
        self.is_fuzzy = False
        self.is_tips_xml = False

    def diag(self):
        """Print the entry number, its line number and its original text."""
        print('')
        print("msg nr: %d, lineno: %d%s"
              % (self.nr, self.lineno, " (fuzzy)" if self.is_fuzzy else ""))
        sys.stdout.write(''.join(self._msgid))
        sys.stdout.write(''.join(self._msgidp))
        if sys.version_info[0] < 3:
            sys.stdout.write(''.join(self._msgstr).encode('utf-8'))
        else:
            sys.stdout.write(''.join(self._msgstr))

    def _quoted_text(self, keyword_pat, line, lineno):
        """Return the text between the quotes of `line`, after removing a
        leading keyword matched by `keyword_pat`.  A missing quote is
        reported but does not stop processing."""
        text = keyword_pat.sub('', line, count=1).strip()
        # Slices instead of indexing: an empty payload is reported as a
        # missing quote rather than raising IndexError.
        if text[:1] != '"' or text[-1:] != '"':
            print("ERROR at line %d: Missing quote." % lineno)
        return strip_quotes(text)

    def add_msgctxt(self, line, lineno):
        """Append a msgctxt line (first or continuation)."""
        self._msgctxt.append(line)
        self.msgctxt += self._quoted_text(self._msgctxt_kw, line, lineno)

    def add_msgid(self, line, lineno):
        """Append a msgid line (first or continuation)."""
        self._msgid.append(line)
        self.msgid += self._quoted_text(self._msgid_kw, line, lineno)

    def add_msgidp(self, line, lineno):
        """Append a msgid_plural line (first or continuation)."""
        self._msgidp.append(line)
        self.msgidp += self._quoted_text(self._msgidp_kw, line, lineno)

    def add_new_msgstr(self, line, lineno):
        """Start a new msgstr (one per plural form) with `line`."""
        self.msgstr.append('')  # Start a new msgstr
        self.add_msgstr(line, lineno)

    def add_msgstr(self, line, lineno):
        """Append a line to the most recently started msgstr."""
        self._msgstr.append(line)
        self.msgstr[-1] += self._quoted_text(self._msgstr_kw, line, lineno)

    def add_cmnt(self, line):
        """Record a comment line and update the fuzzy / tips.xml flags."""
        self._cmnt.append(line)
        if not self.is_fuzzy and self.fuzzy_pat.search(line):
            self.is_fuzzy = True
        if not self.is_tips_xml and self.tips_xml_pat.search(line):
            self.is_tips_xml = True


def create_new_Msgid(msgs, lineno):
    """Create a Msgid numbered after its position, append it to `msgs`
    and return it."""
    msg = Msgid(len(msgs), lineno)
    msgs.append(msg)
    return msg


def _warn(text, fname, lineno):
    """Print a parser warning with its file position."""
    print('WARNING: %s at %s:%d' % (text, fname, lineno))


def _unexpected_token(token, fname, lineno):
    """Build the exception for a keyword that is not allowed here."""
    return CheckException('Unexpected %s at %s:%d' % (token, fname, lineno))


def _unexpected_state(state):
    """Build the exception for a transition the parser does not know."""
    return CheckException('Unexpected state in po parsing '
                          '(state = %s)' % state)


def read_msgs(fname):
    """Parse the PO/POT file `fname` (UTF-8) and return its entries.

    Returns a list of Msgid objects; entries that have neither a msgid nor
    a msgstr are left out.  Raises CheckException on input the state
    machine cannot handle.
    """
    # The parsed list is also published as the module-level name `msgs`;
    # kept because code outside this function may rely on it.
    global msgs

    empty_pat = re.compile(r'^ \s* $', re.VERBOSE)
    comment_pat = re.compile(r'\#', re.VERBOSE)
    msgctxt_pat = re.compile(r'msgctxt \s+ "', re.VERBOSE)
    msgid_pat = re.compile(r'msgid \s+ "', re.VERBOSE)
    msgid_plural_pat = re.compile(r'msgid_plural \s+ "', re.VERBOSE)
    msgstr_pat = re.compile(r'msgstr (\[\d\])? \s+ "', re.VERBOSE)
    str_pat = re.compile(r'"', re.VERBOSE)
    old_pat = re.compile(r'\#~ \s+ ', re.VERBOSE)

    # Close the file as soon as it has been read.
    with io.open(fname, encoding='utf-8') as f:
        lines = f.readlines()

    # parse it like a statemachine
    NONE = 'NONE'            # Nothing detected, yet
    CMNT = 'CMNT'            # Inside comment part
    MSGCTXT = 'msgctxt'      # Inside msgctxt part
    MSGID = 'msgid'          # Inside msgid part
    MSGIDP = 'msgid_plural'  # Inside msgid_plural part
    MSGSTR = 'msgstr'        # Inside msgstr part
    STR = 'STR'              # A continuation string
    OLD = 'OLD'              # An old pattern with #~

    # Tried in this order; the first pattern that matches decides.
    classifiers = (
        (old_pat, OLD),
        (comment_pat, CMNT),
        (msgctxt_pat, MSGCTXT),
        (msgid_pat, MSGID),
        (msgid_plural_pat, MSGIDP),
        (msgstr_pat, MSGSTR),
        (str_pat, STR),
    )

    state = NONE
    msg = None
    msgs = []

    # Use 1-based line numbers for messages
    for lineno, line in enumerate(lines, 1):
        if empty_pat.match(line):
            continue  # Empty lines are not interesting

        # What's the next state?
        for pat, kind in classifiers:
            if pat.match(line):
                next_state = kind
                break
        else:
            _warn('Unexpected input', fname, lineno)
            next_state = NONE

        if state == NONE:
            # expect msgctxt, msgid, comment or old stuff
            if next_state == CMNT:
                state = CMNT
                msg = create_new_Msgid(msgs, lineno)  # Start a new item
                msg.add_cmnt(line)
            elif next_state == MSGCTXT:
                state = MSGCTXT
                msg = create_new_Msgid(msgs, lineno)  # Start a new item
                msg.add_msgctxt(line, lineno)
            elif next_state == MSGID:
                state = MSGID
                msg = create_new_Msgid(msgs, lineno)  # Start a new item
                msg.add_msgid(line, lineno)
            elif next_state == MSGIDP:
                raise _unexpected_token(next_state, fname, lineno)
            elif next_state == MSGSTR:
                _warn('Wild msgstr', fname, lineno)
                state = MSGSTR
                msg = create_new_Msgid(msgs, lineno)  # Start a new item
                msg.add_new_msgstr(line, lineno)
            elif next_state == STR:
                _warn('Wild string', fname, lineno)
            elif next_state == OLD:
                pass  # Just skip
            else:
                raise _unexpected_state(state)

        elif state == CMNT:
            # Expect more comment, msgctxt, or msgid.
            # If msgstr or string it is flagged as error.
            if next_state == CMNT:
                # Note. Comments without a current item (after old
                # entries) are skipped for now.
                if msg:
                    msg.add_cmnt(line)
            elif next_state == MSGCTXT:
                state = MSGCTXT
                if not msg:
                    msg = create_new_Msgid(msgs, lineno)
                msg.add_msgctxt(line, lineno)
            elif next_state == MSGID:
                state = MSGID
                if not msg:
                    msg = create_new_Msgid(msgs, lineno)
                msg.add_msgid(line, lineno)
            elif next_state == MSGIDP:
                raise _unexpected_token(next_state, fname, lineno)
            elif next_state == MSGSTR:
                _warn('Wild msgstr', fname, lineno)
                state = MSGSTR
                msg = create_new_Msgid(msgs, lineno)  # Start a new item
                msg.add_new_msgstr(line, lineno)
            elif next_state == STR:
                _warn('Wild string', fname, lineno)
            elif next_state == OLD:
                msg = None  # Just skip
            else:
                raise _unexpected_state(state)

        elif state == MSGCTXT:
            # Expect more msgctxt or msgid.
            # If msgstr it is flagged as error.
            if next_state == CMNT:
                # Hmmm. A comment here?
                _warn('Unexpected comment', fname, lineno)
            elif next_state == MSGCTXT:
                raise _unexpected_token(next_state, fname, lineno)
            elif next_state == MSGID:
                state = MSGID
                msg.add_msgid(line, lineno)
            elif next_state == MSGIDP:
                raise _unexpected_token(next_state, fname, lineno)
            elif next_state == MSGSTR:
                _warn('Wild msgstr', fname, lineno)
                state = MSGSTR
                msg = create_new_Msgid(msgs, lineno)  # Start a new item
                msg.add_new_msgstr(line, lineno)
            elif next_state == STR:
                # Continuation of msgctxt, stay in state MSGCTXT.
                # (An earlier duplicate STR branch used to shadow this one
                # and dropped the continuation with a warning.)
                msg.add_msgctxt(line, lineno)
            else:
                raise _unexpected_state(state)

        elif state == MSGID:
            # Expect msgstr or msgid_plural or string
            if next_state == CMNT:
                # Hmmm. A comment here?
                _warn('Unexpected comment', fname, lineno)
            elif next_state in (MSGCTXT, MSGID):
                raise _unexpected_token(next_state, fname, lineno)
            elif next_state == MSGIDP:
                state = MSGIDP
                msg.add_msgidp(line, lineno)
            elif next_state == MSGSTR:
                state = MSGSTR
                msg.add_new_msgstr(line, lineno)
            elif next_state == STR:
                # Continuation of msgid, stay in state MSGID
                msg.add_msgid(line, lineno)
            elif next_state == OLD:
                msg = None  # Just skip
            else:
                raise _unexpected_state(state)

        elif state == MSGIDP:
            # Expect msgstr or string or comment
            if next_state == CMNT:
                # Hmmm. A comment here?
                _warn('Unexpected comment', fname, lineno)
            elif next_state in (MSGCTXT, MSGID, MSGIDP):
                raise _unexpected_token(next_state, fname, lineno)
            elif next_state == MSGSTR:
                state = MSGSTR
                msg.add_new_msgstr(line, lineno)
            elif next_state == STR:
                # Continuation of msgid_plural, stay in state MSGIDP
                msg.add_msgidp(line, lineno)
            elif next_state == OLD:
                msg = None  # Just skip
            else:
                raise _unexpected_state(state)

        elif state == MSGSTR:
            # Expect comment, or msgid, or string.
            if next_state == CMNT:
                # A comment probably starts a new item
                state = CMNT
                msg = create_new_Msgid(msgs, lineno)
                msg.add_cmnt(line)
            elif next_state == MSGCTXT:
                state = MSGCTXT
                msg = create_new_Msgid(msgs, lineno)
                msg.add_msgctxt(line, lineno)
            elif next_state == MSGID:
                state = MSGID
                msg = create_new_Msgid(msgs, lineno)
                msg.add_msgid(line, lineno)
            elif next_state == MSGIDP:
                raise _unexpected_token(next_state, fname, lineno)
            elif next_state == MSGSTR:
                # New msgstr, probably for plural form
                # Stay in MSGSTR state
                msg.add_new_msgstr(line, lineno)
            elif next_state == STR:
                msg.add_msgstr(line, lineno)
            elif next_state == OLD:
                msg = None  # Just skip
            else:
                raise _unexpected_state(state)

        else:
            raise _unexpected_state(state)

    # Strip items with just comments. (Can this happen?)
    msgs = [m for m in msgs if m.msgid or m.msgstr]
    return msgs


def analyze_msgs(args, fname, msgs, nr_templates=None, nth=0):
    """Run all checks on `msgs` and print a summary plus diagnostics.

    args         -- not used by this function
    fname        -- file name shown in the report
    msgs         -- list of Msgid objects, as returned by read_msgs()
    nr_templates -- number of entries in the template; when None the
                    number of entries in `msgs` is used instead
    nth          -- index of this file in the run; a separator is printed
                    before every file but the first
    """
    checks = [
        Check_fmt('%s'),
        Check_fmt('%d'),
        Check_named_fmt(),
        Check_mapping_fmt(),
        Check_missing_sd(),
        Check_runaway(),
        Check_xml_chars(),
        Check_last_char(),
        Check_shortcut_trans(),
    ]

    nr_fuzzy = 0
    nr_untranslated = 0
    for msg in msgs:
        # Untranslated and fuzzy entries are only counted, not checked.
        if ''.join(msg.msgstr) == '':
            nr_untranslated += 1
            continue
        if msg.is_fuzzy:
            nr_fuzzy += 1
            continue
        for c in checks:
            c.process(msg)

    nr_msgs = len(msgs)
    if nr_templates is None:
        # The default used to crash in the '%d' formatting below.
        nr_templates = nr_msgs

    if nth > 0:
        print('')
        print("=====================================")
    print("%-20s%s" % ("File:", fname))
    print("%-20s%d" % ("Template total:", nr_templates))
    print("%-20s%d" % ("PO total:", nr_msgs))
    print("%-20s%d" % ("Fuzzy:", nr_fuzzy))
    print("%-20s%d" % ("Untranslated:", nr_untranslated))

    for c in checks:
        c.summary()

    # An empty PO file or an empty template would divide by zero; report
    # 0% in those cases.
    if nr_msgs:
        po_coverage = (1.0 - (float(nr_untranslated) / float(nr_msgs))) * 100
    else:
        po_coverage = 0.0
    print("%-20s%5.2f%%" % ("PO Coverage:", po_coverage))

    not_displayed = nr_untranslated + nr_fuzzy
    if nr_templates:
        template_coverage = po_coverage * float(nr_msgs) / float(nr_templates)
        translation = (1.0 - (float(not_displayed) / float(nr_templates))) * 100
    else:
        template_coverage = 0.0
        translation = 0.0
    print("%-20s%5.2f%%" % ("Template Coverage:", template_coverage))

    text = "%-20s%5.2f%%" % ("Localized at:", translation)
    # Differing coverages mean the PO file and the template have a
    # different number of entries.
    if int(template_coverage * 1000) == int(po_coverage * 1000):
        print(text)
    else:
        print(text + ' (previous gramps.pot)')

    for c in checks:
        c.diag()


def main():
    """Command line entry point: check the PO file given with -s against
    'gramps.pot' in the current directory."""
    parser = ArgumentParser(description='This program validates '
                                        'a PO file for GRAMPS.')
    po_files = [name for name in os.listdir('.') if name.endswith('.po')]
    parser.add_argument("-s", dest="summary",
                        choices=po_files,
                        default=False,
                        help="the summary of check, "
                             "and if need, it gives details")
    args = parser.parse_args()

    if not args.summary:
        return

    # Use the parsed value rather than sys.argv[2:], which is empty when
    # the option is written as '-sfile.po'.
    files = [args.summary]
    try:
        nr_templates = len(read_msgs('gramps.pot'))
        for nth, fname in enumerate(files):
            analyze_msgs(files, fname, read_msgs(fname), nr_templates, nth)
    except CheckException as e:
        print('Oops. %s' % e)
        print('Bailing out')


if __name__ == "__main__":
    main()