#!/usr/bin/env python3 # -*- coding: utf-8 -*- import os import re import csv import glob import codecs import difflib import argparse import xml.etree.ElementTree as etree import xml.dom.minidom as mdom """ Naming conventions: 1. <language code>-<country code>-<brand-name>.csv 2. We will follow ISO639-2 http://www.loc.gov/standards/iso639-2/php/code_list.php for language names and ISO 3166-1-alpha-2 http://www.iso.org/iso/en/prods-services/iso3166ma/02iso-3166-code-lists/list-en1.html for country codes. 3. Only language code is mandatory. Other is optional We will distinguish master "en.csv" file and other translations files. 1. "en.csv" will contain rows: "key | en strings | en string or nothing | platform" 2. Other files will contain "key | en string | translation | optional comment" 3. We will ignore last column on all languages except main "en" file """ class Polyglot: """ This class contains global constants and global settings that was received from command line """ class Platform: Android = "android" Apple = "ios" Windows = "wp" Blackberry57 = "bb" Qt = "qt" Any = "any" class Action: Generate = "generate" Deduplicate = "simplify" Analyze = "analyze" # Consts AliasIndex = 0 EnglishInxed = 1 TranslationIndex = 2 PlathformIndex = 3 CommentIndex = 4 SourceFileExt = '.csv' MasterFileName = 'en' + SourceFileExt SupportedPlatforms = [Platform.Android, Platform.Apple, Platform.Windows, Platform.Blackberry57] Actions = [Action.Generate, Action.Analyze] BlackberryPackage = 'com.PGLtd.strings' BaseNames = { Platform.Android: "strings", Platform.Apple: "Localizable", Platform.Windows: "LocalizedStrings", Platform.Blackberry57: "LocalizedStrings" } # Vars CsvRootPath = None CsvEnPath = None CsvXXPath = [] MasterDirName = None OutputRoot = './' @staticmethod def csv_reader_from_file(csv_file): """ this method handle links on windows also """ try: dialect = csv.Sniffer().sniff(csv_file.read(1024), delimiters=";,") csv_file.seek(0) return csv.reader(csv_file, dialect) except (csv.Error, UnicodeDecodeError) as e: # process symbolic links on windows (exception may be because symlink) if os.name == 'nt': with open(csv_file.name, 'rb') as link_file: data = link_file.readline() target_bytes = [] link_type = "" if data.startswith(b'!<symlink>\xFF\xFE'): link_type = "Cygwin" target_bytes = data.replace(b'!<symlink>\xFF\xFE', b'').replace(b'\x00', b'') else: link_type = "Unix" target_bytes = data target = os.path.dirname(link_file.name) + os.sep + target_bytes.decode('UTF-8') print ("%s symbolic link %s -> %s" % (link_type, link_file.name, target)) real_csv_file = open(target, encoding='UTF-8') return Polyglot.csv_reader_from_file(real_csv_file) @staticmethod def init(cmd_args): csv_path = cmd_args.path Polyglot.MasterDirName = cmd_args.master_dir if not (os.path.exists(csv_path)): raise ValueError("Path '" + csv_path + "' noes not exists. Please specify correct") if os.path.isdir(csv_path): # batch-mode Polyglot.CsvRootPath = csv_path[:-1] if csv_path.endswith('/') else csv_path if os.path.exists(os.path.join(Polyglot.CsvRootPath, Polyglot.MasterFileName)): Polyglot.CsvEnPath = os.path.join(Polyglot.CsvRootPath, Polyglot.MasterFileName) for csv_basename in glob.glob(Polyglot.CsvRootPath + '/*' + Polyglot.SourceFileExt): # print(os.path.abspath(csv_basename)) Polyglot.CsvXXPath.append(os.path.abspath(csv_basename)) else: raise ValueError("Unable to find '{0}'. Exit.".format(csv_path + '/' + Polyglot.MasterFileName)) else: # single file Polyglot.CsvRootPath = os.path.dirname(csv_path) Polyglot.CsvXXPath.append(os.path.abspath(csv_path)) if os.path.exists(os.path.join(Polyglot.CsvRootPath, Polyglot.MasterFileName)): # print(os.path.abspath(Polyglot.CsvEnPath)) Polyglot.CsvEnPath = os.path.join(Polyglot.CsvRootPath, Polyglot.MasterFileName) else: raise ValueError("Unable to find '{0}'. Exit.".format(csv_path + '/' + Polyglot.MasterFileName)) Polyglot.OutputRoot = args.output_dir Polyglot.FileBaseName = 'strings' Polyglot.BlackberryPackage = args.blackberry_package class AbstractBuilder: def __init__(self): self.template_fixing_enabled = True def add_string(self, key, value, comment): raise NotImplementedError("method 'add_string' not implemented in subclass") def get_result(self, output, lc, cc): raise NotImplementedError("method 'get_result' not implemented in subclass") def fix_template_placeholder(self, value): raise NotImplementedError("method 'fix_template' not implemented in subclass") @staticmethod def prettify(xml): xml_byte = etree.tostring(xml, encoding='UTF-8', method='xml') xml_str = xml_byte.decode(encoding='UTF-8') xml_again = mdom.parseString(xml_str) pretty_xml_str = xml_again.toprettyxml(indent='\t', encoding='UTF-8').decode(encoding='UTF-8') return pretty_xml_str def fix_template(self, value): if self.template_fixing_enabled: index = 0 for param in re.findall(r"\{[A-Za-z0-9_ ]+\}", value): value = value.replace(param, self.fix_template_placeholder(param, index)) index += 1 return value class AndroidBuilder(AbstractBuilder): """ http://developer.android.com/guide/topics/resources/string-resource.html """ def __init__(self): self.output_xml = etree.Element("resources") self.platform = Polyglot.Platform.Android self.template_fixing_enabled = False self.target_file = "{output}/android/res/values-{lc}{cc}/{basename}.xml" def add_string(self, key, value, comment): if comment: comment_node = etree.Comment(comment) self.output_xml.append(comment_node) value = self.fix_template(value) string_node = etree.SubElement(self.output_xml, "string") string_node.set("name", key) # escape symbols string_node.text = value.replace('\'', '\\\'') def fix_template_placeholder(self, value, index): """ http://developer.android.com/reference/android/content/res/Resources.html#getQuantityString(int, int, java.lang.Object...) """ return "{{0}}".format(str(index)) def get_result(self, output, lc, cc): # fix for case when cc missing cc = '-r' + cc if cc else '' return {self.target_file.format(output=output, basename=Polyglot.BaseNames[self.platform], lc=lc, cc=cc): AbstractBuilder.prettify( self.output_xml)} class IOSBuilder(AbstractBuilder): """ https://developer.apple.com/library/mac/documentation/Cocoa/Conceptual/LoadingResources/Strings/Strings.html """ def __init__(self): self.output_plain = '' self.platform = Polyglot.Platform.Apple self.template_fixing_enabled = False self.target_file = "{output}/ios/{lc}{cc}.lproj/{basename}.strings" def add_string(self, key, value, comment): entry = '' if comment: entry += "/* {0} */\n".format(comment) value = self.fix_template(value) # escape symbols value = value.replace('\"', '\\\"') value = value.replace('\\u0020', ' ') # fast solution, need to find long term solution entry += "\"{alias}\" = \"{value}\";\n".format(alias=key, value=value) self.output_plain += entry def fix_template_placeholder(self, value, index): """ https://developer.apple.com/library/mac/documentation/Cocoa/Conceptual/Strings/Articles/FormatStrings.html """ return '%@' def get_result(self, output, lc, cc): # fix for case when cc missing cc = '-' + cc if cc else '' return {self.target_file.format(output=output, basename=Polyglot.BaseNames[self.platform], lc=lc, cc=cc): self.output_plain} class ResXBuilder(AbstractBuilder): """ http://msdn.microsoft.com/en-us/library/ekyft91f.aspx """ def __init__(self): self.output_xml = etree.Element('root') self.platform = Polyglot.Platform.Windows self.template_fixing_enabled = True self.target_file = "{output}/wp/{basename}.{lc}{cc}.resx" # build header # internal XML schema missing resheader_node = etree.SubElement(self.output_xml, 'resheader') resheader_node.set('name', 'resmimetype') resheader_node = etree.SubElement(resheader_node, 'value') resheader_node.text = 'text/microsoft-resx' resheader_node = etree.SubElement(self.output_xml, 'resheader') resheader_node.set('name', 'version') resheader_node = etree.SubElement(resheader_node, 'value') resheader_node.text = '2.0' resheader_node = etree.SubElement(self.output_xml, 'resheader') resheader_node.set('name', 'reader') resheader_node = etree.SubElement(resheader_node, 'value') resheader_node.text = 'System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, ' \ 'Culture=neutral, PublicKeyToken=b77a5c561934e089' resheader_node = etree.SubElement(self.output_xml, 'resheader') resheader_node.set('name', 'writer') resheader_node = etree.SubElement(resheader_node, 'value') resheader_node.text = 'System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, ' \ 'Culture=neutral, PublicKeyToken=b77a5c561934e089' def add_string(self, key, value, comment): key_node = etree.SubElement(self.output_xml, 'data') key_node.set('name', key) key_node.set('xml:space', 'preserve') value = self.fix_template(value) value = value.replace('\\u0020', ' ') # fast solution, need to find long term solution value_node = etree.SubElement(key_node, 'value') value_node.text = value if comment: comment_node = etree.SubElement(key_node, 'comment') comment_node.text = comment def fix_template_placeholder(self, value, index): """ http://msdn.microsoft.com/en-us/library/system.string.format.aspx """ return '{{0}}'.format(str(index)) def get_result(self, output, lc, cc): # fix for case when cc missing cc = '-' + cc if cc else '' return {self.target_file.format(output=output, basename=Polyglot.BaseNames[self.platform], lc=lc, cc=cc): AbstractBuilder.prettify( self.output_xml)} class BlackBerry57Builder(AbstractBuilder): """ https://developer.blackberry.com/bbos/java/documentation/localize_apps_2006594_11.html """ def __init__(self): self.header = 'package {pkg};\n\n'.format(pkg=Polyglot.BlackberryPackage) self.impl = '' self.header_idx = 0 self.platform = Polyglot.Platform.Blackberry57 self.template_fixing_enabled = True self.target_header_file = "{output}/bb/res/{bb_package_path}/{basename}.rrh" self.target_source_file = "{output}/bb/res/{bb_package_path}/{basename}_{lc}{cc}.rrc" def add_string(self, key, value, comment): header_entry_template = '{alias}#0={index};\n' impl_entry_template = '{alias}#0="{value}";\n' value = self.fix_template(value) # escape symbols value = value.replace('\"', '\\\"') self.header += header_entry_template.format(alias=key, index=self.header_idx) self.impl += impl_entry_template.format(alias=key, value=value) self.header_idx += 1 def fix_template_placeholder(self, value, index): """ http://www.blackberry.com/developers/docs/4.5.0api/javax/microedition/global/Formatter.html """ return "{{0}}".format(str(index)) def get_result(self, output, lc, cc): if self.impl: # fix for case when cc missing cc = '_' + cc if cc else '' bb_package_path = Polyglot.BlackberryPackage.replace('.', '/') header_path = self.target_header_file.format(output=output, bb_package_path=bb_package_path, basename=Polyglot.BaseNames[self.platform]) source_path = self.target_source_file.format(output=output, bb_package_path=bb_package_path, basename=Polyglot.BaseNames[self.platform], lc=lc, cc=cc) return {header_path: self.header, source_path: self.impl} else: return {} class QtBuilder(AbstractBuilder): """ http://qt-project.org/doc/qt-4.8/linguist-ts-file-format.html """ def __init__(self): self.output_xml = etree.Element('TS') self.output_xml.set('version', '2.1') self.platform = Polyglot.Platform.Qt self.context = etree.SubElement(self.output_xml, 'context') name = etree.SubElement(self.context, 'name') name.text = '!AUTO GENERATED, FIXME IF YOU CAN!' self.template_fixing_enabled = False self.target_source_file = "{output}/qt/{basename}_{lc}{cc}.tc" def add_string(self, key, value, comment): message = etree.SubElement(self.context, 'message') source_node = etree.SubElement(message, 'source') source_node.text = key if comment: comment_node = etree.SubElement(message, 'comment') comment_node.text = comment value = self.fix_template(value) translation_node = etree.SubElement(message, 'translation') translation_node.text = value def get_result(self, output, lc, cc): # fix for case when cc missing cc = "_" + cc if cc else "" self.output_xml.set('language', lc + cc) return {self.target_file.format(output=output, basename=Polyglot.BaseNames[self.platform], lc=lc, cc=cc): AbstractBuilder.prettify( self.output_xml)} class Worker: """ Generator for single file """ def __init__(self, platforms, lc, cc): self.builders = [] self.lc = lc self.cc = cc for p in platforms: if p == Polyglot.Platform.Android: self.builders.append(AndroidBuilder()) elif p == Polyglot.Platform.Apple: self.builders.append(IOSBuilder()) elif p == Polyglot.Platform.Windows: self.builders.append(ResXBuilder()) elif p == Polyglot.Platform.Blackberry57: self.builders.append(BlackBerry57Builder()) elif p == Polyglot.Platform.Qt: self.builders.append(QtBuilder()) else: print("Unsupported platform '" + p + "'. Please try one of " + str(Polyglot.SupportedPlatforms)) def process_row(self, row, enable_comments, pk_map): try: key = row[Polyglot.AliasIndex] value = row[Polyglot.TranslationIndex] platforms = pk_map[key] if key in pk_map else ' '.join(Polyglot.SupportedPlatforms) comment = row[Polyglot.CommentIndex] if enable_comments and len(row) > 4 else None for b in self.builders: if b.platform in platforms: # FIX unprocessed double quote escaping # http://stackoverflow.com/questions/7334752/problem-due-to-double-quote-while-parsing-csv while '""' in value: value = value.replace('""', '"') b.add_string(key, value, comment) except IndexError: print("Error row=" + str(row)) def process(self, csv_path, enable_comments, pk_map): """ @arg csv_path @arg enable_comments @arg pk_map - platform/key map """ existing_keys = [] with open(csv_path, encoding='UTF-8') as csv_file: reader = Polyglot.csv_reader_from_file(csv_file) for row in reader: existing_keys.append(row[0]) self.process_row(row, enable_comments, pk_map) if Polyglot.MasterDirName: master_csv_path = os.path.join(Polyglot.MasterDirName, os.path.basename(csv_path)) with open(master_csv_path, encoding='UTF-8') as csv_file: reader = Polyglot.csv_reader_from_file(csv_file) for row in reader: if row[0] not in existing_keys: self.process_row(row, enable_comments, pk_map) for b in self.builders: output_files = b.get_result(Polyglot.OutputRoot, self.lc, self.cc) for file_name in output_files.keys(): new_filename = file_name idx = 1 while os.path.exists(new_filename): file_name_comps = os.path.splitext(file_name) new_filename = file_name_comps[0] + ' ' + str(idx) + file_name_comps[1] idx += 1 dir_name = os.path.dirname(os.path.abspath(new_filename)) if not os.path.exists(dir_name): os.makedirs(dir_name, exist_ok=True) file = codecs.open(new_filename, mode='w', encoding='UTF-8') file.write(output_files[file_name]) file.close() print("Generated '{0}'".format(new_filename)) class Director: """ Manage worker objects """ def __init__(self, platforms, enable_comments): if Polyglot.Platform.Any in platforms: del platforms[:] platforms.extend(Polyglot.SupportedPlatforms) self.uplatforms = set(platforms) self.comments = enable_comments def process(self): pk_map = Director.build_platforms_map(Polyglot.CsvEnPath) # TODO multi threading for csv_path in Polyglot.CsvXXPath: print("Start processing resources '{0}'".format(csv_path)) lc, cc = Director.find_lc_and_cc(csv_path) w = Worker(self.uplatforms, lc, cc) w.process(csv_path, self.comments, pk_map) # Note for BB 5.0 if Polyglot.Platform.Blackberry57 in self.uplatforms: print("\nNOTE: If your Blackberry target os is 5.0 or lower:\n" " 1. make sure that for each string_xx_YY.rrc you have it copy with name string_xx.rrc.\n" " 2. make sure that for each string_xx_YY.rrh you have it copy with name string_xx.rrh.") @staticmethod def build_platforms_map(csv_path): """ only english csv contains information about platforms for particular string this method prepare map with this kind of information """ result = {} with open(csv_path, encoding='UTF-8') as csv_file: reader = Polyglot.csv_reader_from_file(csv_file) for row in reader: key = row[Polyglot.AliasIndex] platforms = row[Polyglot.PlathformIndex] result[key] = platforms if Polyglot.MasterDirName: master_csv_path = os.path.join(Polyglot.MasterDirName, os.path.basename(csv_path)) with open(master_csv_path, encoding='UTF-8') as csv_file: reader = Polyglot.csv_reader_from_file(csv_file) for row in reader: key = row[Polyglot.AliasIndex] platforms = row[Polyglot.PlathformIndex] result[key] = platforms return result @staticmethod def find_lc_and_cc(csv_path): csv_filename = os.path.splitext(os.path.basename(csv_path))[0] lccc = [i for i in csv_filename.split('-') if len(i) == 2] return lccc[0], lccc[1] if len(lccc) > 1 else None class Analyzer: def process(self): print("Start analyzing resources '{0}'".format(Polyglot.CsvRootPath)) alias, book, adup = self.alias_duplicates_detector(Polyglot.CsvEnPath) exact, fuzzy = self.string_duplicates_detector(alias, book) print( " Exact duplicates [{0}]: \n\t{1}\n\n Fuzzy duplicates [{2}]: \n\t{3}\n\n Alias duplicates [{4}]: \n\t{5}\n" .format(len(exact), '\n\t'.join(["'{0}' keys={1}".format(key, str(set(value))) for (key, value) in exact.items()]), len(fuzzy), '\n\t'.join(["'{0}' ~ {1}".format(key, str(set(value))) for (key, value) in fuzzy.items()]), len(adup), '\n\t'.join(adup) if len(adup) > 0 else 'Good!')) if len(Polyglot.CsvXXPath) > 1: # integrity possible only for batch-mode integrity = self.integrity_check(set(alias)) print(" Integrity report [{0}]: ".format(len(integrity.keys()))) for xx in integrity.keys(): missing = integrity[xx][0] redundant = integrity[xx][1] print("\tFile '{0}' ".format(xx)) if len(missing) == 0 and len(redundant) == 0: print("\t\tGood!") else: if len(missing) > 0: print("\t\tMissing : " + str(missing)) if len(redundant) > 0: print("\t\tRedundant : " + str(redundant)) def alias_duplicates_detector(self, csv_path): book = [] aliases = [] alias_duplicates = [] with open(csv_path, encoding='UTF-8') as csv_file: reader = Polyglot.csv_reader_from_file(csv_file) for row in reader: if row[Polyglot.AliasIndex] in aliases: alias_duplicates.append(row[Polyglot.AliasIndex]) aliases.append(row[Polyglot.AliasIndex]) book.append(row[Polyglot.TranslationIndex]) return aliases, book, alias_duplicates def string_duplicates_detector(self, alias, book): exact_duplicates = {} fuzzy_duplicates = {} index = 0 for word in book: candidates = list(book) candidates.remove(word) results = difflib.get_close_matches(word, candidates, 5, 0.86) # a little bit magic if len(results) > 0: if word in results: # exact duplicate if word in exact_duplicates.keys(): exact_duplicates[word].append(alias[index]) else: exact_duplicates[word] = [alias[index]] else: # fuzzy duplicate for result in results: if result in fuzzy_duplicates.keys() and word in fuzzy_duplicates[result]: # need to avoid loop A -> B && B -> A pass elif word not in fuzzy_duplicates.keys(): fuzzy_duplicates[word] = [result] elif word in fuzzy_duplicates.keys() and len(fuzzy_duplicates[word]) == 0: fuzzy_duplicates[word] = [result] else: fuzzy_duplicates[word].append(result) index += 1 return exact_duplicates, fuzzy_duplicates def integrity_check(self, alias): # { filename : [[missing_keys], [redundant_keys]] } report = {} for csv_path in Polyglot.CsvXXPath: xx_alias = [] with open(csv_path, encoding='UTF-8') as csv_file: reader = Polyglot.csv_reader_from_file(csv_file) for row in reader: xx_alias.append(row[Polyglot.AliasIndex]) xx_missing = [] for a in alias: if a in xx_alias: xx_alias.remove(a) else: xx_missing.append(a) xx_redundant = xx_alias report[csv_path] = [xx_missing, xx_redundant] return report class Simplifier: def process(self, csv_path): with open(csv_path, encoding='UTF-8') as csv_file: dialect = csv.Sniffer().sniff(csv_file.read(1024), delimiters=";,") csv_file.seek(0) reader = csv.reader(csv_file, dialect) rows_dict = {} for row in reader: word = row[Polyglot.EnglishInxed] if word in rows_dict.keys(): first = rows_dict[word] aliases = self.select_alias(first[0], row[0]) platforms = self.merge_platforms(first[2], row[2]) first[0] = aliases[0] first[2] = platforms if len(first) < 4: first.append('') first[3] = first[3] + '/' + aliases[1] pass else: rows_dict[word] = row pass csv_result_path = '{0}.temp'.format(csv_path) with open(csv_result_path, mode='x', encoding='UTF-8') as csv_result_file: writer = csv.writer(csv_result_file, dialect) writer.writerows(rows_dict.values()) def merge_platforms(self, pl1, pl2, separator=' '): """ Just merge two space separated list :param pl1: first string :param pl2: second string :return: """ p1 = pl1.split(separator) p2 = pl2.split(separator) p3 = set(p1 + p2) return ' '.join(p3) def select_alias(self, a1, a2): """ This method sorts aliases by priority. android aliases will win by condition written below :param a1: first alias :param a2: second alias :return: aliases due to it's priority """ a1_w = 0 a2_w = 0 # if you have space inside its bad if ' ' in a1: a1_w -= 2 if ' ' in a2: a2_w -= 2 # if you have upper letter inside it less worse for c in a1: if c.isupper(): a1_w -= 1 for c in a2: if c.isupper(): a2_w -= 1 return [a1, a2] if a1_w > a2_w else [a2, a1] if __name__ == "__main__": parser = argparse.ArgumentParser(prog='polyglot.py', add_help=True, formatter_class=argparse.RawDescriptionHelpFormatter, description='Tool that help generate static strings file for different platforms ' + str(Polyglot.SupportedPlatforms), epilog='\nExamples of Use:\n' \ '\t./polyglot.py -p examples/en.csv - specify single file for generation single file\n' \ '\t./polyglot.py -p examples - specify directory for batch processing, many files will be generated\n' \ '\t./polyglot.py -p examples/fr.csv -m pg - in generated files strings form pg/fr.csv dir will be overwriten by stanley/fr.csv \n' \ '\t./polyglot.py -a analyze -p examples/fr.csv - check for duplicates in strings and in aliases (first column)\n' \ ) group = parser.add_argument_group('General arguments', 'arguments that applies to all platform independently') group.add_argument ('-a', '--action', required=False, default=Polyglot.Action.Generate, help='Supported actions {0}'.format(Polyglot.Actions)) group.add_argument ('-p', '--path', required=True, help='Path to csv file (or directory with csv files) that have specific format.' 'column1=key, column2=string, column3=translation ' 'column3=coma-separated list with platforms column4=comment') group.add_argument ('-pl', '--platform', nargs='+', default=[Polyglot.Platform.Any], required=False, help='List of platforms {0}. By default any'.format(Polyglot.SupportedPlatforms)) group.add_argument ('-m', '--master-dir', default=None, required=False, help='Enable "master" mode. Directory where the strings will be taken to override. It may be overwriten by strings from -p/--path') group.add_argument ('-o', '--output-dir', default='output', required=False, help='Directory where result will be placed'.format(Polyglot.SupportedPlatforms)) group.add_argument ('-ec', '--enable-comments', action='store_true', required=False, help='This flag help to generate comments column4=comment in csv. Disabled by default') group = parser.add_argument_group('Platform-specific arguments', 'arguments that applies to specified platform') group.add_argument ('-bbpkg', '--blackberry-package', default='com.PGLtd.strings', required=False, help='package name for blackberry rrh file. for more details see blackberry docs') args = parser.parse_args() try: Polyglot.init(args) if args.action == Polyglot.Action.Generate: g = Director(args.platform, args.enable_comments) g.process() elif args.action == Polyglot.Action.Analyze: a = Analyzer() a.process() elif args.action == Polyglot.Action.Deduplicate: s = Simplifier() s.process(args.path) pass else: parser.print_help() except Exception as e: print("Unexpected error '" + str(e) + "'") raise e