#!/usr/bin/python
# -*- coding:utf-8 -*-
'''
Created on 21 Oct 2013

@author: mlenart
'''
import os
import sys
import logging
import codecs
from morfeuszbuilder.fsa import encode
from morfeuszbuilder.fsa import convertinput
from morfeuszbuilder.fsa.fsa import FSA
from morfeuszbuilder.fsa.serializer import Serializer, SerializationMethod
from morfeuszbuilder.tagset.tagset import Tagset
from morfeuszbuilder.segrules import rulesParser
from morfeuszbuilder.utils import exceptions, limits
from optparse import OptionParser
def _checkOption(opt, parser, msg):
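    """Print msg and usage info, then exit, when a required option is missing."""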
if opt is None:
print >> sys.stderr, msg
parser.print_help()
        sys.exit(1)
def _checkCondition(cond, parser, msg):
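    """Print msg and usage info, then exit, when the condition cond does not hold."""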
if not cond:
print >> sys.stderr, msg
parser.print_help()
        sys.exit(1)
def _parseListCallback(option, opt, value, parser):
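    """optparse callback that splits a comma-separated option value into a list."""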
setattr(parser.values, option.dest, value.split(','))
def _checkOpen(filename, mode):
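    """Check that filename can be opened in the given mode, exiting on IOError; a file created by a 'w' check is removed again."""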
try:
with open(filename, mode) as _:
pass
if 'w' in mode:
os.remove(filename)
except IOError as ex:
print >> sys.stderr, str(ex)
        sys.exit(1)
def _getDictFilename(opts, isGenerator):
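    """Return the output dictionary path: <dictDir>/<dictName>-s.dict for the generator (synthesis), -a.dict for the analyzer."""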
typeCode = 's' if isGenerator else 'a'
fname = '%s-%s.dict' % (opts.dictName, typeCode)
return os.path.join(opts.dictDir, fname)
def _parseOptions():
"""
Parses commandline args
"""
parser = OptionParser()
parser.add_option('--input-files',
type='string',
dest='inputFiles',
action='callback',
callback=_parseListCallback,
metavar='FILES',
help='comma separated list of dictionary files')
parser.add_option('--tagset-file',
dest='tagsetFile',
metavar='FILE',
help='path to the file with tagset')
parser.add_option('--segments-file',
dest='segmentsFile',
metavar='FILE',
help='path to the file with segment rules')
#~ parser.add_option('--trim-supneg',
#~ dest='trimSupneg',
#~ default=False,
#~ action='store_true',
#~ help='this option is ignored and exists only for backwards compatibility')
parser.add_option('--dict',
dest='dictName',
help='the name of result dictionary')
parser.add_option('--dict-dir',
dest='dictDir',
metavar='FILE',
default=os.getcwd(),
help='path to output directory (the default is current dir)')
parser.add_option('--only-analyzer',
dest='onlyAnalyzer',
action='store_true',
default=False,
help='Generate dictionary for morphological analysis only (default is both analysis and synthesis)')
parser.add_option('--only-generator',
dest='onlyGenerator',
action='store_true',
default=False,
help='Generate dictionary for morphological synthesis only (default is both analysis and synthesis)')
parser.add_option('--analyzer-cpp',
dest='analyzerCpp',
metavar='FILE',
help='Encode analyzer dictionary data in given c++ file')
parser.add_option('--generator-cpp',
dest='generatorCpp',
metavar='FILE',
help='Encode generator dictionary data in given c++ file')
#~ parser.add_option('--use-arrays',
#~ dest='useArrays',
#~ action='store_true',
#~ default=False,
#~ help='store states reachable by 2 transitions in arrays (should speed up recognition, available only when --serialization-method=V1)')
parser.add_option('--serialization-method',
dest='serializationMethod',
default='V1',
help="FSA serialization method: \
SIMPLE - fixed-length transitions, fastest and weakest compression \
V1 - variable-length transitions, compressed labels - strongest compression \
V2 - format similar to the default in Jan Daciuk's fsa package - variable-length transitions, non-compressed labels - good compression, good speed")
#~ parser.add_option('--visualize',
#~ dest='visualize',
#~ action='store_true',
#~ default=False,
#~ help='visualize result')
parser.add_option('--analyzer-train-file',
dest='analyzerTrainFile',
                      help='A text file used for analyzer training. It should contain one word per line, taken from some large corpus. With a proper train file, the resulting analysis automaton should be faster.')
parser.add_option('--generator-train-file',
dest='generatorTrainFile',
                      help='A text file used for generator training. It should contain one word per line, taken from some large corpus. With a proper train file, the resulting synthesis automaton should be faster.')
parser.add_option('--debug',
dest='debug',
action='store_true',
default=False,
help='output some debugging info')
#~ parser.add_option('--profile',
#~ dest='profile',
#~ action='store_true',
#~ default=False,
#~ help='show profiling graph (required pycallgraph and graphviz')
opts, args = parser.parse_args()
_checkOption(opts.inputFiles, parser, "Input file is missing")
_checkOption(opts.dictDir, parser, "Output dictionary dir is missing")
_checkCondition((opts.onlyAnalyzer, opts.onlyGenerator) != (True, True),
parser, 'Cannot set both --only-analyzer and --only-generator')
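    # writeCpp is True when at least one of --analyzer-cpp / --generator-cpp was given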
writeCpp = {opts.analyzerCpp, opts.generatorCpp} != {None}
_checkCondition(opts.dictName or writeCpp, parser, "Dictionary name is missing")
_checkCondition(opts.onlyGenerator or opts.analyzerCpp or not writeCpp, parser, "Analyzer .cpp output file path is missing")
_checkCondition(opts.onlyAnalyzer or opts.generatorCpp or not writeCpp, parser, "Generator .cpp output file path is missing")
#~ _checkCondition((opts.dictName, opts.outputCpp) != (None, None),
#~ parser, 'Must set at least one of: --dict-name, --output-cpp')
#~ _checkOption(opts.outputFile, parser, "Output file is missing")
_checkOption(opts.tagsetFile, parser, "Tagset file is missing")
_checkOption(opts.segmentsFile, parser, "Segmentation file is missing")
#~ _checkOption(opts.serializationMethod, parser, "Serialization method file is missing")
#~ _checkExactlyOneOptionSet([opts.analyzer, opts.generator],
#~ parser, 'Must set exactly one FSA type: --analyzer or --generator')
_checkOpen(opts.tagsetFile, 'r')
_checkOpen(opts.segmentsFile, 'r')
for filename in opts.inputFiles:
_checkOpen(filename, 'r')
if not opts.onlyGenerator:
_checkOpen(_getDictFilename(opts, isGenerator=False), 'w')
if not opts.onlyAnalyzer:
_checkOpen(_getDictFilename(opts, isGenerator=True), 'w')
    if opts.serializationMethod.upper() not in [SerializationMethod.SIMPLE, SerializationMethod.V1]:
        print >> sys.stderr, '--serialization-method must be one of: ' + str([SerializationMethod.SIMPLE, SerializationMethod.V1])
parser.print_help()
        sys.exit(1)
return opts
def _concatFiles(inputFiles):
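    """Yield the lines of all given input files, skipping entries whose text form or lemma contains a space."""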
for inputFile in inputFiles:
if inputFile:
with open(inputFile, 'r') as f:
for line in f:
                    if line and ' ' not in ''.join(line.split('\t')[:2]):
                        yield line
                    else:
                        logging.warning(u'Ignoring line: "%s" - contains space in text form or lemma' % line.strip().decode('utf8'))
def _readNamesAndQualifiers(inputFiles):
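    """Collect entry names and qualifier sets from the input files and map each of them to a numeric index."""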
names = set([u''])
qualifiers = set([frozenset()])
for line in _concatFiles(inputFiles):
line = line.strip().decode('utf8')
if line:
_, _, _, name, qualifier = convertinput.parseLine(line)
names.add(name)
qualifiers.add(convertinput.parseQualifiers(qualifier))
namesMap = dict([(name, idx) for idx, name in enumerate(sorted(list(names)))])
qualifiersMap = dict([(quals, idx) for idx, quals in enumerate(sorted(qualifiers, key=lambda q: tuple(sorted(q))))])
exceptions.validate(
len(qualifiersMap) <= limits.MAX_QUALIFIERS_COMBINATIONS,
u'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS)
return namesMap, qualifiersMap

def _readPolimorfInput4Analyzer(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager):
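    """Yield (word, data) analyzer entries converted from PoliMorf input files."""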
logging.info('reading analyzer data from %s', str(inputFiles))
for entry in convertinput.PolimorfConverter4Analyzer(tagset, namesMap, qualifiersMap, encoder, 'utf8', segmentRulesManager).convert(_concatFiles(inputFiles)):
yield entry
def _readPolimorfInput4Generator(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager):
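    """Yield (word, data) generator entries converted from PoliMorf input files."""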
logging.info('reading generator data from %s', str(inputFiles))
for entry in convertinput.PolimorfConverter4Generator(tagset, namesMap, qualifiersMap, encoder, 'utf8', segmentRulesManager).convert(_concatFiles(inputFiles)):
yield entry
def _readTrainData(trainFile):
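    """Yield the stripped lines (words) of a UTF-8 encoded train file."""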
with codecs.open(trainFile, 'r', 'utf8') as f:
for line in f:
yield line.strip()
def _printStats(fsa):
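    """Log state and transition statistics of the given automaton."""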
acceptingNum = 0
sinkNum = 0
arrayNum = 0
for s in fsa.dfs():
if s.isAccepting():
acceptingNum += 1
if s.transitionsNum == 0:
sinkNum += 1
if s.serializeAsArray:
arrayNum += 1
logging.info('states num: '+str(fsa.getStatesNum()))
logging.info('transitions num: '+str(fsa.getTransitionsNum()))
logging.info('accepting states num: '+str(acceptingNum))
logging.info('sink states num: '+str(sinkNum))
logging.info('array states num: '+str(arrayNum))
def buildAnalyzerFromPoliMorf(inputFiles, tagset, namesMap, qualifiersMap, segmentRulesManager):
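    """Build, close and return the analyzer FSA from PoliMorf input data."""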
encoder = encode.MorphEncoder()
fsa = FSA(encoder, tagset)
for word, data in _readPolimorfInput4Analyzer(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager):
# print word, data
fsa.addEntry(word, data)
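        # drop the references eagerly - presumably to keep peak memory usage down while building the FSA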
del word
del data
fsa.close()
logging.info('------')
logging.info('Analyzer FSA stats:')
logging.info('------')
_printStats(fsa)
return fsa
def buildGeneratorFromPoliMorf(inputFiles, tagset, namesMap, qualifiersMap, segmentRulesManager):
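    """Build, close and return the generator FSA from PoliMorf input data."""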
encoder = encode.Encoder4Generator()
fsa = FSA(encoder, tagset)
inputData = _readPolimorfInput4Generator(inputFiles, tagset, namesMap, qualifiersMap, encoder, segmentRulesManager)
for word, data in inputData:
fsa.addEntry(word, data)
fsa.close()
logging.info('------')
logging.info('Generator FSA stats:')
logging.info('------')
_printStats(fsa)
return fsa
def _doBuildDictionaryPart(opts, tagset, namesMap, qualifiersMap, isGenerator):
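    """Build the analyzer or generator automaton, optionally train it and serialize it to the configured outputs."""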
logging.info('reading segmentation rules')
rulesParserVersion = rulesParser.RulesParser.PARSE4ANALYZER if not isGenerator else rulesParser.RulesParser.PARSE4GENERATOR
segmentRulesManager = rulesParser.RulesParser(tagset, namesMap, qualifiersMap, rulesParserVersion).parse(opts.segmentsFile)
segmentationRulesData = segmentRulesManager.serialize()
logging.info('done reading segmentation rules')
logging.info('building automaton')
buildFunction = buildAnalyzerFromPoliMorf if not isGenerator else buildGeneratorFromPoliMorf
fsa = buildFunction(opts.inputFiles, tagset, namesMap, qualifiersMap, segmentRulesManager)
logging.info('done building automaton')
if not isGenerator and opts.analyzerTrainFile:
logging.info('training analyzer automaton with '+opts.analyzerTrainFile+' ...')
fsa.train(_readTrainData(opts.analyzerTrainFile))
logging.info('done training')
    if isGenerator and opts.generatorTrainFile:
        logging.info('training generator automaton with '+opts.generatorTrainFile+' ...')
        fsa.train(_readTrainData(opts.generatorTrainFile))
        logging.info('done training')

    serializer = Serializer.getSerializer(opts.serializationMethod, fsa, tagset, namesMap, qualifiersMap, segmentationRulesData)
if opts.generatorCpp and isGenerator:
serializer.serialize2CppFile(opts.generatorCpp, isGenerator=isGenerator)
if opts.analyzerCpp and not isGenerator:
serializer.serialize2CppFile(opts.analyzerCpp, isGenerator=isGenerator)
if opts.dictDir:
serializer.serialize2BinaryFile(_getDictFilename(opts, isGenerator=isGenerator), isGenerator=isGenerator)
logging.info('total FSA size (in bytes): '+str(fsa.initialState.reverseOffset))
def main(opts):
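    """Configure logging, read the tagset, names and qualifiers, then build the requested dictionary parts."""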
if opts.debug:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
logging.info('reading tagset')
tagset = Tagset(opts.tagsetFile)
logging.info('done reading tagset')
logging.info('reading names and qualifiers')
namesMap, qualifiersMap = _readNamesAndQualifiers(opts.inputFiles)
logging.info('done reading names and qualifiers')
if not opts.onlyGenerator:
_doBuildDictionaryPart(opts, tagset, namesMap, qualifiersMap, isGenerator=False)
if not opts.onlyAnalyzer:
_doBuildDictionaryPart(opts, tagset, namesMap, qualifiersMap, isGenerator=True)
if __name__ == '__main__':
opts = _parseOptions()
#~ try:
main(opts)
#~ except Exception as ex:
#~ print >> sys.stderr, u'Building dictionary file failed:', unicode(ex).encode('utf8'), 'type of error:', type(ex)
#~ sys.exit(1)
#~ finally:
#~ pass