|
1
|
#!/usr/bin/python
|
|
2
3
4
5
6
7
|
# -*- coding:utf-8 -*-
'''
Created on 21 paź 2013
@author: mlenart
'''
|
|
8
|
import os
|
|
9
10
11
|
import sys
import logging
import codecs
|
|
12
13
14
15
|
from morfeuszbuilder.fsa import encode
from morfeuszbuilder.fsa import convertinput
from morfeuszbuilder.fsa.fsa import FSA
from morfeuszbuilder.fsa.serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer
|
|
16
17
|
from morfeuszbuilder.tagset.tagset import Tagset
from morfeuszbuilder.segrules import rulesParser
|
|
18
19
|
from optparse import OptionParser
|
|
20
21
22
23
|
# class InputFormat():
# ENCODED = 'ENCODED'
# POLIMORF = 'POLIMORF'
# PLAIN = 'PLAIN'
|
|
24
|
|
|
25
26
27
28
29
|
class SerializationMethod():
    """Names of the supported FSA serialization formats (values accepted by --serialization-method)."""
    SIMPLE = 'SIMPLE'
    V1 = 'V1'
    V2 = 'V2'
|
|
30
31
32
33
34
35
36
37
38
39
40
41
|
def _checkOption(opt, parser, msg):
if opt is None:
print >> sys.stderr, msg
parser.print_help()
exit(1)
def _checkExactlyOneOptionSet(optsList, parser, msg):
if optsList.count(True) != 1:
print >> sys.stderr, msg
parser.print_help()
exit(1)
|
|
42
43
44
|
def _parseListCallback(option, opt, value, parser):
setattr(parser.values, option.dest, value.split(','))
|
|
45
46
47
48
49
50
51
52
|
def _checkOpen(filename, mode):
try:
with open(filename, mode) as _:
pass
except IOError as ex:
print >> sys.stderr, str(ex)
exit(1)
|
|
53
|
def _parseOptions():
    """
    Parse commandline args, validate them (existence of required options,
    readability/writability of the given files) and return the options object.
    Exits with code 1 and a usage message on any validation failure.
    """
    parser = OptionParser()
    parser.add_option('--input-files',
                      type='string',
                      dest='inputFiles',
                      action='callback',
                      callback=_parseListCallback,
                      metavar='FILES',
                      help='comma separated list of files')
    parser.add_option('--tagset-file',
                      dest='tagsetFile',
                      metavar='FILE',
                      help='path to the file with tagset')
    parser.add_option('--segments-file',
                      dest='segmentsFile',
                      metavar='FILE',
                      help='path to the file with segment rules')
    parser.add_option('--trim-supneg',
                      dest='trimSupneg',
                      default=False,
                      action='store_true',
                      help='trim "naj" and "nie" prefixes from words tagged as "%:sup" and "%:neg" respectively. Valid only for analysis.')
    parser.add_option('-o', '--output-file',
                      dest='outputFile',
                      metavar='FILE',
                      help='path to output file')
    parser.add_option('-a', '--analyzer',
                      dest='analyzer',
                      action='store_true',
                      default=False,
                      help='Generate FSA for morphological analysis')
    parser.add_option('-g', '--generator',
                      dest='generator',
                      action='store_true',
                      default=False,
                      help='Generate FSA for morphological synthesis')
    parser.add_option('--cpp',
                      dest='cpp',
                      action='store_true',
                      default=False,
                      help='Encode binary data in c++ file')
    parser.add_option('--use-arrays',
                      dest='useArrays',
                      action='store_true',
                      default=False,
                      help='store states reachable by 2 transitions in arrays (should speed up recognition, available only when --serialization-method=V1)')
    parser.add_option('--serialization-method',
                      dest='serializationMethod',
                      help="FSA serialization method: \
SIMPLE - fixed-length transitions, fastest and weakest compression \
V1 - variable-length transitions, compressed labels - strongest compression \
V2 - format similar to the default in Jan Daciuk's fsa package - variable-length transitions, non-compressed labels - good compression, good speed")
    #~ parser.add_option('--visualize',
                        #~ dest='visualize',
                        #~ action='store_true',
                        #~ default=False,
                        #~ help='visualize result')
    parser.add_option('--train-file',
                      dest='trainFile',
                      help='A text file used for training. Should contain words from some large corpus - one word in each line')
    parser.add_option('--debug',
                      dest='debug',
                      action='store_true',
                      default=False,
                      help='output some debugging info')

    opts, args = parser.parse_args()

    # Required options.
    _checkOption(opts.inputFiles, parser, "Input file is missing")
    _checkOption(opts.outputFile, parser, "Output file is missing")
    _checkOption(opts.tagsetFile, parser, "Tagset file is missing")
    # Fixed message: the serialization method is an enum value, not a file.
    _checkOption(opts.serializationMethod, parser, "Serialization method is missing")
    _checkExactlyOneOptionSet([opts.analyzer, opts.generator],
                              parser, 'Must set exactly one FSA type: --analyzer or --generator')

    # Fail early if any of the involved files cannot be opened.
    _checkOpen(opts.tagsetFile, 'r')
    for filename in opts.inputFiles:
        _checkOpen(filename, 'r')
    _checkOpen(opts.outputFile, 'w')

    if opts.analyzer:
        _checkOption(opts.segmentsFile, parser, "Segment rules file is missing")
        _checkOpen(opts.segmentsFile, 'r')

    # Normalize to upper case BEFORE validating, so that a lower-case value
    # (e.g. --serialization-method=v1) which passed the case-insensitive check
    # does not later crash the serializer lookup keyed on SIMPLE/V1/V2.
    opts.serializationMethod = opts.serializationMethod.upper()
    validMethods = [SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2]
    if opts.serializationMethod not in validMethods:
        sys.stderr.write('--serialization-method must be one of (' + str(validMethods) + ')\n')
        parser.print_help()
        exit(1)

    return opts
|
|
151
|
def _concatFiles(inputFiles):
|
|
152
|
# return open(inputFiles[0], 'r')
|
|
153
154
155
156
157
158
159
160
161
162
|
for inputFile in inputFiles:
if inputFile:
with open(inputFile, 'r') as f:
for line in f:
yield line
def _readPolimorfInput4Analyzer(inputFiles, tagset, encoder, segmentRulesManager, trimSupneg):
    """Yield analyzer entries converted from the PoliMorf-format input files."""
    logging.info('reading analyzer data from %s', str(inputFiles))
    converter = convertinput.PolimorfConverter4Analyzer(
        tagset, encoder, 'utf8', segmentRulesManager, trimSupneg)
    for entry in converter.convert(_concatFiles(inputFiles)):
        yield entry
|
|
163
|
|
|
164
165
166
167
|
def _readPolimorfInput4Generator(inputFiles, tagset, encoder):
    """Yield generator entries converted from the PoliMorf-format input files."""
    logging.info('reading generator data from %s', str(inputFiles))
    converter = convertinput.PolimorfConverter4Generator(tagset, encoder, 'utf8')
    for entry in converter.convert(_concatFiles(inputFiles)):
        yield entry
|
|
168
|
|
|
169
|
def _readTrainData(trainFile):
|
|
170
171
172
173
|
with codecs.open(trainFile, 'r', 'utf8') as f:
for line in f:
yield line.strip()
|
|
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
|
def _printStats(fsa):
acceptingNum = 0
sinkNum = 0
arrayNum = 0
for s in fsa.dfs():
if s.isAccepting():
acceptingNum += 1
if s.transitionsNum == 0:
sinkNum += 1
if s.serializeAsArray:
arrayNum += 1
logging.info('states num: '+str(fsa.getStatesNum()))
logging.info('transitions num: '+str(fsa.getTransitionsNum()))
logging.info('accepting states num: '+str(acceptingNum))
logging.info('sink states num: '+str(sinkNum))
logging.info('array states num: '+str(arrayNum))
|
|
191
|
def buildAnalyzerFromPoliMorf(inputFiles, tagset, segmentRulesManager, trimSupneg):
    """Build, close and return an analyzer FSA from PoliMorf-format input files."""
    encoder = encode.MorphEncoder()
    fsa = FSA(encoder, tagset)
    entries = _readPolimorfInput4Analyzer(
        inputFiles, tagset, encoder, segmentRulesManager, trimSupneg)
    for word, data in entries:
        fsa.addEntry(word, data)
    fsa.close()
    logging.info('------')
    logging.info('Analyzer FSA stats:')
    logging.info('------')
    _printStats(fsa)
    return fsa
|
|
204
|
def buildGeneratorFromPoliMorf(inputFiles, tagset):
    """Build, close and return a generator FSA from PoliMorf-format input files."""
    encoder = encode.Encoder4Generator()
    fsa = FSA(encoder, tagset)
    for word, data in _readPolimorfInput4Generator(inputFiles, tagset, encoder):
        fsa.addEntry(word, data)
    fsa.close()
    logging.info('------')
    logging.info('Generator FSA stats:')
    logging.info('------')
    _printStats(fsa)
    return fsa
|
|
216
|
|
|
217
|
def main(opts):
    """
    Build the analyzer or generator FSA according to the parsed commandline
    options, optionally train it, serialize it to the output file and log stats.
    """
    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    if opts.analyzer:
        logging.info('*** building analyzer ***')
    else:
        logging.info('*** building generator ***')

    logging.info('reading tagset from %s', opts.tagsetFile)
    tagset = Tagset(opts.tagsetFile)

    if opts.analyzer:
        # Analyzer additionally embeds the serialized segmentation rules.
        segmentRulesManager = rulesParser.RulesParser(tagset).parse(opts.segmentsFile)
        additionalData = segmentRulesManager.serialize()
        fsa = buildAnalyzerFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager, opts.trimSupneg)
    else:
        fsa = buildGeneratorFromPoliMorf(opts.inputFiles, tagset)
        additionalData = bytearray()

    if opts.trainFile:
        logging.info('training with '+opts.trainFile+' ...')
        fsa.train(_readTrainData(opts.trainFile))
        logging.info('done training')

    # Upper-case the key: _parseOptions validates --serialization-method
    # case-insensitively, so the lookup must be case-insensitive as well
    # (a raw lower-case value would otherwise raise KeyError here).
    serializer = {
        SerializationMethod.SIMPLE: SimpleSerializer,
        SerializationMethod.V1: VLengthSerializer1,
        SerializationMethod.V2: VLengthSerializer2,
    }[opts.serializationMethod.upper()](fsa)

    if opts.cpp:
        serializer.serialize2CppFile(opts.outputFile, generator=opts.generator, additionalData=additionalData)
    else:
        serializer.serialize2BinaryFile(opts.outputFile, additionalData=additionalData)

    logging.info('total FSA size (in bytes): '+str(fsa.initialState.reverseOffset))
|
|
260
|
|
|
261
|
if __name__ == '__main__':
    # Removed redundant local `import os` -- os is already imported at module top.
    opts = _parseOptions()
    main(opts)
|
|
265
|
|