# generate.py (4.42 KB)
'''
Created on Nov 22, 2013

@author: mlenart
'''

import os
import sys
import logging
from optparse import OptionParser

ARRAY_SIZE = 0x4000

def _parseOptions():
    """
    Parses commandline args.

    Only the -o/--output-file option is registered, and it is required.
    When it is missing, an error is logged, the usage help is printed and
    the process exits with status 1.

    Returns the optparse options object; the output path is available as
    its outputFile attribute.
    """
    parser = OptionParser()
    # NOTE: the input-file and header-filename options are disabled, so they
    # must not be referenced by the validation below.
#     parser.add_option('-i', '--input-file',
#                         dest='inputFile',
#                         metavar='FILE',
#                         help='path to input file (CaseFolding.txt)')
    parser.add_option('-o', '--output-file',
                        dest='outputFile',
                        metavar='FILE',
                        help='path to output C++ source file')
#     parser.add_option('--header-filename', 
#                         dest='headerFilename',
#                         help='name of the C++ header file')

    opts, _args = parser.parse_args()

    # Validate only options that are actually registered: reading an
    # unregistered attribute from the optparse result raises AttributeError.
    if opts.outputFile is None:
        logging.error('Missing some options')
        parser.print_help()
        # sys.exit is always available, unlike the site-provided exit().
        sys.exit(1)
    return opts

def _parseCaseFoldingTxtFile(f):
    """
    Builds case folding tables from CaseFolding.txt-style lines.

    f is any iterable of text lines of the form
    '<code>; <status>; <mapping>; # comment' with hexadecimal codes.
    Blank lines and lines starting with '#' are skipped. Only entries whose
    status is exactly 'C' or 'S' are used; all other statuses are ignored.

    Returns a (table, extendedTable) pair:
      * table is a list of ARRAY_SIZE entries where table[code] is the
        folded code (identity when the input gives no mapping),
      * extendedTable is a dict with the mappings for codes >= ARRAY_SIZE.
    """
    table = list(range(ARRAY_SIZE))
    extendedTable = {}
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        # Split on ';' and strip each field so that the parser does not
        # depend on the exact amount of whitespace after the separator.
        fields = [field.strip() for field in line.split(';')]
        code = int(fields[0], 16)
        # Exact match against the two accepted statuses; a substring test
        # against 'CS' would also accept '' and 'CS'.
        if fields[1] in ('C', 'S'):
            targetCode = int(fields[2], 16)
            if code < ARRAY_SIZE:
                table[code] = targetCode
            else:
                extendedTable[code] = targetCode
    return table, extendedTable

def _parseUnicodeDataTxtFile(f):
    """
    Builds lowercase and titlecase mapping tables from UnicodeData.txt-style
    lines.

    f is any iterable of ';'-separated text lines. Field 0 holds the
    hexadecimal code, field 13 the lowercase mapping and field 14 the
    titlecase mapping; an empty mapping field means the code maps to itself.
    Blank lines and lines starting with '#' are skipped.

    Returns (toLowerTable, extToLowerTable, toTitleTable, extToTitleTable).
    The two lists have ARRAY_SIZE entries indexed by code (identity by
    default); the two dicts hold the non-identity mappings of codes that
    are >= ARRAY_SIZE.
    """
    toLowerTable = list(range(ARRAY_SIZE))
    toTitleTable = list(range(ARRAY_SIZE))
    extToLowerTable = {}
    extToTitleTable = {}

    for rawLine in f:
        record = rawLine.strip()
        if not record or record.startswith('#'):
            continue

        fields = record.split(';')
        code = int(fields[0], 16)
        lowerTarget = int(fields[13], 16) if fields[13] else code
        titleTarget = int(fields[14], 16) if fields[14] else code

        # Same storage rule for both mappings: small codes go into the
        # dense list, the rest into the sparse dict.
        destinations = (
            (lowerTarget, toLowerTable, extToLowerTable),
            (titleTarget, toTitleTable, extToTitleTable),
        )
        for target, dense, sparse in destinations:
            if target == code:
                continue
            if code < ARRAY_SIZE:
                dense[code] = target
            else:
                sparse[code] = target

    return toLowerTable, extToLowerTable, toTitleTable, extToTitleTable
    

def _serializeTable(table):
    """
    Renders a sequence of codes as a C++ brace initializer.

    Every element is followed by a comma (including the last one), e.g.
    [1, 2, 3] becomes '{1,2,3,}' and an empty sequence becomes '{}'.
    """
    entries = ''.join(str(targetCode) + ',' for targetCode in table)
    return '{' + entries + '}'

def _serializeExtendedTable(table):
    """
    Renders a {code: targetCode} dict as a C++ initializer of pairs.

    Each entry becomes '{code,targetCode},' and the whole list is wrapped
    in braces, e.g. {1: 2, 3: 4} becomes '{{1,2},{3,4},}'; an empty dict
    becomes '{}'.

    Entries are emitted in ascending code order so that the generated
    source is identical from run to run regardless of dict ordering.
    """
    # items() exists on both Python 2 and Python 3 dicts (iteritems() is
    # Python 2 only); sorting makes the output deterministic.
    rows = ''.join(
        '{' + str(code) + ',' + str(targetCode) + '},'
        for code, targetCode in sorted(table.items()))
    return '{' + rows + '}'

def _serialize(toLowerTable, extToLowerTable, toTitleTable, extToTitleTable):
    """
    Produces the text of the generated C++ source file.

    The output includes "case_folding.hpp" and defines, for both the
    lowercase and the titlecase mapping, the size constants and the
    initialized arrays for the dense table and the extended table.
    """
    template = '''
#include "case_folding.hpp"

const unsigned int TO_LOWERCASE_TABLE_SIZE = {toLowerTableSize};
const unsigned int EXT_TO_LOWERCASE_TABLE_SIZE = {extToLowerTableSize};
const uint32_t TO_LOWERCASE_TABLE[] = {toLowerTable};
const uint32_t EXT_TO_LOWERCASE_TABLE[][2] = {extToLowerTable};

const unsigned int TO_TITLECASE_TABLE_SIZE = {toTitleTableSize};
const unsigned int EXT_TO_TITLECASE_TABLE_SIZE = {extToTitleTableSize};
const uint32_t TO_TITLECASE_TABLE[] = {toTitleTable};
const uint32_t EXT_TO_TITLECASE_TABLE[][2] = {extToTitleTable};
'''
    # Values substituted into the named placeholders of the template.
    substitutions = {
        'toLowerTableSize': len(toLowerTable),
        'toLowerTable': _serializeTable(toLowerTable),
        'extToLowerTableSize': len(extToLowerTable),
        'extToLowerTable': _serializeExtendedTable(extToLowerTable),
        'toTitleTableSize': len(toTitleTable),
        'toTitleTable': _serializeTable(toTitleTable),
        'extToTitleTableSize': len(extToTitleTable),
        'extToTitleTable': _serializeExtendedTable(extToTitleTable),
    }
    return template.format(**substitutions)

if __name__ == '__main__':
    # Usage: generate.py OUTPUT_FILE
    # Reads UnicodeData.txt located next to this script and writes the
    # generated C++ source with the case mapping tables to OUTPUT_FILE.
    if len(sys.argv) < 2:
        # Report the missing argument instead of failing with IndexError.
        sys.exit('usage: %s OUTPUT_FILE' % sys.argv[0])
    outfile = sys.argv[1]
    dataPath = os.path.join(os.path.dirname(__file__), 'UnicodeData.txt')
    with open(dataPath, 'r') as f:
        toLowerTable, extToLowerTable, toTitleTable, extToTitleTable = _parseUnicodeDataTxtFile(f)
    # The input is fully parsed (and closed) before the output is opened, so
    # a parse failure never truncates an existing output file.
    with open(outfile, 'w') as f1:
        f1.write(_serialize(toLowerTable, extToLowerTable, toTitleTable, extToTitleTable))