#!/usr/bin/python
# -*- coding:utf-8 -*-
'''
Created on 21 Oct 2013

@author: mlenart
'''
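# morfeusz_builder: builds Morfeusz dictionary automata from PoliMorf-style
# input files, a tagset file and a segmentation-rules file, writing them out
# as binary .dict files and/or generated C++ sources.
#
# Example invocation (argument values are illustrative only):
#   ./morfeusz_builder --input-files=polimorf.tab \
#       --tagset-file=tagset.dat --segments-file=segments.dat \
#       --dict=morfeusz --dict-dir=/tmp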
import os
import sys
import logging
import codecs
from morfeuszbuilder.fsa import encode
from morfeuszbuilder.fsa import convertinput
from morfeuszbuilder.fsa.fsa import FSA
from morfeuszbuilder.fsa.serializer import Serializer, SerializationMethod
from morfeuszbuilder.tagset.tagset import Tagset
from morfeuszbuilder.segrules import rulesParser
from morfeuszbuilder.utils import exceptions, limits
from optparse import OptionParser
def _checkOption(opt, parser, msg):
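    """Print an error message and the usage help, then exit, when a required option is missing."""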
    if opt is None:
        print >> sys.stderr, msg
        parser.print_help()
        exit(1)
def _checkCondition(cond, parser, msg):
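    """Like _checkOption, but for an arbitrary boolean condition."""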
    if not cond:
        print >> sys.stderr, msg
        parser.print_help()
        exit(1)
def _parseListCallback(option, opt, value, parser):
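    """optparse callback: split a comma-separated option value into a list."""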
    setattr(parser.values, option.dest, value.split(','))
def _checkOpen(filename, mode):
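    """Check that the file can be opened in the given mode; a file opened for writing is removed again after the check."""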
    try:
        with open(filename, mode) as _:
            pass
        if 'w' in mode:
            os.remove(filename)
    except IOError as ex:
        print >> sys.stderr, str(ex)
        exit(1)
def _getDictFilename(opts, isGenerator):
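    """Return the output path: <dictDir>/<dictName>-a.dict for the analyzer, <dictName>-s.dict for the generator."""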
    typeCode = 's' if isGenerator else 'a'
    fname = '%s-%s.dict' % (opts.dictName, typeCode)
    return os.path.join(opts.dictDir, fname)
def _parseOptions():
    """
    Parses commandline args
    """
    parser = OptionParser()
    parser.add_option('--input-files',
                        type='string',
                        dest='inputFiles',
                        action='callback',
                        callback=_parseListCallback,
                        metavar='FILES',
                        help='comma-separated list of dictionary files')
    parser.add_option('--tagset-file',
                        dest='tagsetFile',
                        metavar='FILE',
                        help='path to the tagset file')
    parser.add_option('--segments-file',
                        dest='segmentsFile',
                        metavar='FILE',
                        help='path to the file with segmentation rules')
    #~ parser.add_option('--trim-supneg',
                        #~ dest='trimSupneg',
                        #~ default=False,
                        #~ action='store_true',
                        #~ help='this option is ignored and exists only for backwards compatibility')
    parser.add_option('--dict',
                        dest='dictName',
                        help='name of the resulting dictionary')
    parser.add_option('--dict-dir',
                        dest='dictDir',
                        metavar='FILE',
                        default=os.getcwd(),
                        help='path to the output directory (defaults to the current directory)')
    parser.add_option('--only-analyzer',
                        dest='onlyAnalyzer',
                        action='store_true',
                        default=False,
                        help='Generate dictionary for morphological analysis only (default is both analysis and synthesis)')
    parser.add_option('--only-generator',
                        dest='onlyGenerator',
                        action='store_true',
                        default=False,
                        help='Generate dictionary for morphological synthesis only (default is both analysis and synthesis)')
    parser.add_option('--analyzer-cpp',
                        dest='analyzerCpp',
                        metavar='FILE',
                        help='Encode analyzer dictionary data in the given C++ file')
    parser.add_option('--generator-cpp',
                        dest='generatorCpp',
                        metavar='FILE',
                        help='Encode generator dictionary data in the given C++ file')
    #~ parser.add_option('--use-arrays',
                        #~ dest='useArrays',
                        #~ action='store_true',
                        #~ default=False,
                        #~ help='store states reachable by 2 transitions in arrays (should speed up recognition, available only when --serialization-method=V1)')
    parser.add_option('--serialization-method',
                        dest='serializationMethod',
                        default='V1',
                        help="FSA serialization method: \
                        SIMPLE - fixed-length transitions, fastest and weakest compression \
                        V1 - variable-length transitions, compressed labels - strongest compression \
                        V2 - format similar to the default in Jan Daciuk's fsa package - variable-length transitions, non-compressed labels - good compression, good speed")
    #~ parser.add_option('--visualize',
                        #~ dest='visualize',
                        #~ action='store_true', 
                        #~ default=False,
                        #~ help='visualize result')
    parser.add_option('--analyzer-train-file',
                        dest='analyzerTrainFile',
                        help='A text file used for analyzer training. Should contain words from a large corpus, one word per line. A suitable training file makes the resulting analysis automaton faster.')
    parser.add_option('--generator-train-file',
                        dest='generatorTrainFile',
                        help='A text file used for generator training. Should contain words from a large corpus, one word per line. A suitable training file makes the resulting synthesis automaton faster.')
    parser.add_option('--debug',
                        dest='debug',
                        action='store_true',
                        default=False,
                        help='output some debugging info')
    #~ parser.add_option('--profile',
                        #~ dest='profile',
                        #~ action='store_true',
                        #~ default=False,
                        #~ help='show profiling graph (requires pycallgraph and graphviz)')

    opts, args = parser.parse_args()
    _checkOption(opts.inputFiles, parser, "Input file is missing")
    _checkOption(opts.dictDir, parser, "Output dictionary dir is missing")
    _checkCondition((opts.onlyAnalyzer, opts.onlyGenerator) != (True, True), 
                              parser, 'Cannot set both --only-analyzer and --only-generator')
    writeCpp = {opts.analyzerCpp, opts.generatorCpp} != {None}
    _checkCondition(opts.dictName or writeCpp, parser, "Dictionary name is missing")
    _checkCondition(opts.onlyGenerator or opts.analyzerCpp or not writeCpp, parser, "Analyzer .cpp output file path is missing")
    _checkCondition(opts.onlyAnalyzer or opts.generatorCpp or not writeCpp, parser, "Generator .cpp output file path is missing")
    #~ _checkCondition((opts.dictName, opts.outputCpp) != (None, None), 
                              #~ parser, 'Must set at least one of: --dict-name, --output-cpp')
    #~ _checkOption(opts.outputFile, parser, "Output file is missing")
    _checkOption(opts.tagsetFile, parser, "Tagset file is missing")
    _checkOption(opts.segmentsFile, parser, "Segmentation file is missing")
    #~ _checkOption(opts.serializationMethod, parser, "Serialization method file is missing")
    #~ _checkExactlyOneOptionSet([opts.analyzer, opts.generator], 
                              #~ parser, 'Must set exactly one FSA type: --analyzer or --generator')

    _checkOpen(opts.tagsetFile, 'r')
    _checkOpen(opts.segmentsFile, 'r')
    for filename in opts.inputFiles:
        _checkOpen(filename, 'r')
    if not opts.onlyGenerator:
        _checkOpen(_getDictFilename(opts, isGenerator=False), 'w')
    if not opts.onlyAnalyzer:
        _checkOpen(_getDictFilename(opts, isGenerator=True), 'w')
    if opts.serializationMethod.upper() not in [SerializationMethod.SIMPLE, SerializationMethod.V1]:
        print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')'
        parser.print_help()
        exit(1)
    return opts
def _concatFiles(inputFiles):
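    """Yield lines from all input files, skipping (with a warning) entries whose text form or lemma contains a space."""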
    for inputFile in inputFiles:
        if inputFile:
            with open(inputFile, 'r') as f:
                for line in f:
                    if line and ' ' not in ''.join(line.split('\t')[:2]):
                        yield line
                    else:
                        logging.warn(u'Ignoring line: "%s" - contains space in text form or lemma' % line.strip().decode('utf8'))
def _readNamesAndQualifiers(inputFiles):
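    """Scan the input and build name->index and qualifier-set->index maps (an empty name and an empty qualifier set are always included)."""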
    names = set([u''])
    qualifiers = set([frozenset()])
    for line in _concatFiles(inputFiles):
        line = line.strip().decode('utf8')
        if line:
            _, _, _, name, qualifier = convertinput.parseLine(line)
            names.add(name)
            qualifiers.add(convertinput.parseQualifiers(qualifier))
    namesMap = dict([(name, idx) for idx, name in enumerate(sorted(list(names)))])
    qualifiersMap = dict([(quals, idx) for idx, quals in enumerate(sorted(qualifiers, key=lambda q: tuple(sorted(q))))])
    exceptions.validate(
                    len(qualifiersMap) <= limits.MAX_QUALIFIERS_COMBINATIONS, 
                    u'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS)

    return namesMap, qualifiersMap

def _readPolimorfInput4Analyzer(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager):
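    """Yield encoded (word, data) entries for the analyzer, converted from the concatenated PoliMorf input."""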
    logging.info('reading analyzer data from %s', str(inputFiles))
    for entry in convertinput.PolimorfConverter4Analyzer(tagset, namesMap, qualifiersMap, encoder, 'utf8', segmentRulesManager).convert(_concatFiles(inputFiles)):
        yield entry
def _readPolimorfInput4Generator(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager):
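    """Yield encoded (word, data) entries for the generator, converted from the concatenated PoliMorf input."""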
    logging.info('reading generator data from %s', str(inputFiles))
    for entry in convertinput.PolimorfConverter4Generator(tagset, namesMap, qualifiersMap, encoder, 'utf8', segmentRulesManager).convert(_concatFiles(inputFiles)):
        yield entry
def _readTrainData(trainFile):
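    """Yield stripped lines from a UTF-8 training corpus file (one word per line)."""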
    with codecs.open(trainFile, 'r', 'utf8') as f:
        for line in f:
            yield line.strip()
def _printStats(fsa):
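    """Log basic automaton statistics: states, transitions, accepting, sink and array-serialized states."""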
    acceptingNum = 0
    sinkNum = 0
    arrayNum = 0
    for s in fsa.dfs():
        if s.isAccepting():
            acceptingNum += 1
        if s.transitionsNum == 0:
            sinkNum += 1
        if s.serializeAsArray:
            arrayNum += 1
    logging.info('states num: '+str(fsa.getStatesNum()))
    logging.info('transitions num: '+str(fsa.getTransitionsNum()))
    logging.info('accepting states num: '+str(acceptingNum))
    logging.info('sink states num: '+str(sinkNum))
    logging.info('array states num: '+str(arrayNum))
def buildAnalyzerFromPoliMorf(inputFiles, tagset, namesMap, qualifiersMap, segmentRulesManager):
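    """Build the analysis automaton from PoliMorf-style input files."""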
    encoder = encode.MorphEncoder()
    fsa = FSA(encoder, tagset)
    for word, data in _readPolimorfInput4Analyzer(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager):
#         print word, data
        fsa.addEntry(word, data)
        del word
        del data
    fsa.close()
    logging.info('------')
    logging.info('Analyzer FSA stats:')
    logging.info('------')
    _printStats(fsa)
    return fsa
def buildGeneratorFromPoliMorf(inputFiles, tagset, namesMap, qualifiersMap, segmentRulesManager):
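    """Build the synthesis (generator) automaton from PoliMorf-style input files."""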
    encoder = encode.Encoder4Generator()
    fsa = FSA(encoder, tagset)
    inputData = _readPolimorfInput4Generator(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager)
    for word, data in inputData:
        fsa.addEntry(word, data)
    fsa.close()
    logging.info('------')
    logging.info('Generator FSA stats:')
    logging.info('------')
    _printStats(fsa)
    return fsa
def _doBuildDictionaryPart(opts, tagset, namesMap, qualifiersMap, isGenerator):
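    """Build one dictionary part (analyzer or generator): parse segmentation rules, build and optionally train the automaton, then serialize it to a C++ file and/or a binary .dict file."""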

    logging.info('reading segmentation rules')
    rulesParserVersion = rulesParser.RulesParser.PARSE4ANALYZER if not isGenerator else rulesParser.RulesParser.PARSE4GENERATOR
    segmentRulesManager = rulesParser.RulesParser(tagset, namesMap, qualifiersMap, rulesParserVersion).parse(opts.segmentsFile)
    segmentationRulesData = segmentRulesManager.serialize()
    logging.info('done reading segmentation rules')

    logging.info('building automaton')
    buildFunction = buildAnalyzerFromPoliMorf if not isGenerator else buildGeneratorFromPoliMorf
    fsa = buildFunction(opts.inputFiles, tagset, namesMap, qualifiersMap, segmentRulesManager)
    logging.info('done building automaton')
    if not isGenerator and opts.analyzerTrainFile:
        logging.info('training analyzer automaton with '+opts.analyzerTrainFile+' ...')
        fsa.train(_readTrainData(opts.analyzerTrainFile))
        logging.info('done training')
    if isGenerator and opts.generatorTrainFile:
        logging.info('training generator automaton with '+opts.generatorTrainFile+' ...')
        fsa.train(_readTrainData(opts.generatorTrainFile))
        logging.info('done training')

    serializer = Serializer.getSerializer(opts.serializationMethod, fsa, tagset, namesMap, qualifiersMap, segmentationRulesData)
    if opts.generatorCpp and isGenerator:
        serializer.serialize2CppFile(opts.generatorCpp, isGenerator=isGenerator)
    if opts.analyzerCpp and not isGenerator:
        serializer.serialize2CppFile(opts.analyzerCpp, isGenerator=isGenerator)

    if opts.dictDir:
        serializer.serialize2BinaryFile(_getDictFilename(opts, isGenerator=isGenerator), isGenerator=isGenerator)

    logging.info('total FSA size (in bytes): '+str(fsa.initialState.reverseOffset))
def main(opts):
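    """Build the analyzer and/or generator dictionary according to the parsed command-line options."""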
    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    logging.info('reading tagset')
    tagset = Tagset(opts.tagsetFile)
    logging.info('done reading tagset')

    logging.info('reading names and qualifiers')
    namesMap, qualifiersMap = _readNamesAndQualifiers(opts.inputFiles)
    logging.info('done reading names and qualifiers')
    if not opts.onlyGenerator:
        _doBuildDictionaryPart(opts, tagset, namesMap, qualifiersMap, isGenerator=False)
    if not opts.onlyAnalyzer:
        _doBuildDictionaryPart(opts, tagset, namesMap, qualifiersMap, isGenerator=True)
if __name__ == '__main__':
    opts = _parseOptions()
    #~ try:
    main(opts)
    #~ except Exception as ex:
        #~ print >> sys.stderr, u'Building dictionary file failed:', unicode(ex).encode('utf8'), 'type of error:', type(ex)
        #~ sys.exit(1)
    #~ finally:
        #~ pass