Blame view

fsabuilder/buildfsa.py 10.1 KB
Michał Lenart authored
1
#!/usr/bin/python
Michał Lenart authored
2
3
4
5
6
7
# -*- coding:utf-8 -*-
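'''
Command-line tool that builds a finite state automaton (FSA) for
morphological analysis or synthesis and serializes it to a binary or C++ file.

Created on 21 Oct 2013

@author: mlenart
'''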
Michał Lenart authored
8
import os
Michał Lenart authored
9
10
11
import sys
import logging
import codecs
Michał Lenart authored
12
13
14
15
from morfeuszbuilder.fsa import encode
from morfeuszbuilder.fsa import convertinput
from morfeuszbuilder.fsa.fsa import FSA
from morfeuszbuilder.fsa.serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer
Michał Lenart authored
16
17
from morfeuszbuilder.tagset.tagset import Tagset
from morfeuszbuilder.segrules import rulesParser
Michał Lenart authored
18
19
from optparse import OptionParser
Michał Lenart authored
20
21
22
23
# class InputFormat():
#     ENCODED = 'ENCODED'
#     POLIMORF = 'POLIMORF'
#     PLAIN = 'PLAIN'
Michał Lenart authored
24
Michał Lenart authored
25
26
27
28
29
class SerializationMethod:
    """
    Names of the FSA serialization methods accepted by --serialization-method.

    Each constant's value equals its own name.
    """
    SIMPLE, V1, V2 = 'SIMPLE', 'V1', 'V2'
Michał Lenart authored
30
31
32
33
34
35
36
37
38
39
40
41
def _checkOption(opt, parser, msg):
    """
    Aborts the program when a required option value is missing.

    opt    -- the parsed option value; only None counts as missing
    parser -- object providing print_help() (the OptionParser in use)
    msg    -- message written to stderr before the help text

    Returns None when opt is set; otherwise exits with status 1
    (raises SystemExit).
    """
    if opt is not None:
        return
    # sys.stderr.write works on Python 2 and 3; "print >> stream" is
    # Python 2 only and fails with a TypeError on Python 3.
    sys.stderr.write('%s\n' % (msg,))
    parser.print_help()
    # sys.exit is always available, unlike the site-provided exit() helper.
    sys.exit(1)

def _checkExactlyOneOptionSet(optsList, parser, msg):
    """
    Aborts the program unless exactly one element of optsList equals True.

    optsList -- list of option flags
    parser   -- object providing print_help() (the OptionParser in use)
    msg      -- message written to stderr before the help text

    Returns None when exactly one flag is set; otherwise exits with
    status 1 (raises SystemExit).
    """
    if optsList.count(True) == 1:
        return
    # sys.stderr.write works on Python 2 and 3; "print >> stream" is
    # Python 2 only and fails with a TypeError on Python 3.
    sys.stderr.write('%s\n' % (msg,))
    parser.print_help()
    # sys.exit is always available, unlike the site-provided exit() helper.
    sys.exit(1)
Michał Lenart authored
42
43
44
def _parseListCallback(option, opt, value, parser):
    """
    optparse callback that stores a comma separated value as a list.

    The raw option value is split on ',' and the resulting list is
    assigned to the attribute named by option.dest on parser.values.
    """
    items = value.split(',')
    setattr(parser.values, option.dest, items)
Michał Lenart authored
45
46
47
48
49
50
51
52
def _checkOpen(filename, mode):
    """
    Verifies that filename can be opened with the given mode.

    The file is opened and closed again immediately. Note that with
    mode 'w' this creates the file, or truncates it if it already exists.

    On IOError the error text is written to stderr and the program exits
    with status 1 (raises SystemExit).
    """
    try:
        with open(filename, mode):
            pass
    except IOError as ex:
        # sys.stderr.write works on Python 2 and 3; "print >> stream" is
        # Python 2 only and fails with a TypeError on Python 3.
        sys.stderr.write(str(ex) + '\n')
        # sys.exit is always available, unlike the site-provided exit() helper.
        sys.exit(1)
Michał Lenart authored
53
def _parseOptions():
    """
    Parses and validates the command line arguments.

    Required options: --input-files, --output-file, --tagset-file,
    --serialization-method and exactly one of --analyzer / --generator.
    --segments-file is additionally required together with --analyzer.
    Every input file is test-opened for reading and the output file for
    writing (which creates or truncates it).

    Returns the optparse values object. opts.serializationMethod is
    returned upper-cased, so it can be compared with the
    SerializationMethod constants directly.

    On any validation failure a message is written to stderr and the
    program exits with status 1 (raises SystemExit).
    """
    parser = OptionParser()
    parser.add_option('--input-files',
                        type='string',
                        dest='inputFiles',
                        action='callback',
                        callback=_parseListCallback,
                        metavar='FILES',
                        help='comma separated list of files')
    parser.add_option('--tagset-file',
                        dest='tagsetFile',
                        metavar='FILE',
                        help='path to the file with tagset')
    parser.add_option('--segments-file',
                        dest='segmentsFile',
                        metavar='FILE',
                        help='path to the file with segment rules')
    parser.add_option('--trim-supneg',
                        dest='trimSupneg',
                        default=False,
                        action='store_true',
                        help='trim "naj" and "nie" prefixes from words tagged as "%:sup" and "%:neg" respectively. Valid only for analysis.')
    parser.add_option('-o', '--output-file',
                        dest='outputFile',
                        metavar='FILE',
                        help='path to output file')
    parser.add_option('-a', '--analyzer',
                        dest='analyzer',
                        action='store_true',
                        default=False,
                        help='Generate FSA for morphological analysis')
    parser.add_option('-g', '--generator',
                        dest='generator',
                        action='store_true',
                        default=False,
                        help='Generate FSA for morphological synthesis')
    parser.add_option('--cpp',
                        dest='cpp',
                        action='store_true',
                        default=False,
                        help='Encode binary data in c++ file')
    parser.add_option('--use-arrays',
                        dest='useArrays',
                        action='store_true',
                        default=False,
                        help='store states reachable by 2 transitions in arrays (should speed up recognition, available only when --serialization-method=V1)')
    parser.add_option('--serialization-method',
                        dest='serializationMethod',
                        help="FSA serialization method: \
                        SIMPLE - fixed-length transitions, fastest and weakest compression \
                        V1 - variable-length transitions, compressed labels - strongest compression \
                        V2 - format similar to the default in Jan Daciuk's fsa package - variable-length transitions, non-compressed labels - good compression, good speed")
    parser.add_option('--train-file',
                        dest='trainFile',
                        help='A text file used for training. Should contain words from some large corpus - one word in each line')
    parser.add_option('--debug',
                        dest='debug',
                        action='store_true',
                        default=False,
                        help='output some debugging info')

    opts, args = parser.parse_args()

    # Presence checks for the mandatory options.
    _checkOption(opts.inputFiles, parser, "Input file is missing")
    _checkOption(opts.outputFile, parser, "Output file is missing")
    _checkOption(opts.tagsetFile, parser, "Tagset file is missing")
    _checkOption(opts.serializationMethod, parser, "Serialization method is missing")
    _checkExactlyOneOptionSet([opts.analyzer, opts.generator],
                              parser, 'Must set exactly one FSA type: --analyzer or --generator')

    # Fail early if any of the files cannot be opened.
    _checkOpen(opts.tagsetFile, 'r')
    for filename in opts.inputFiles:
        _checkOpen(filename, 'r')
    _checkOpen(opts.outputFile, 'w')

    if opts.analyzer:
        _checkOption(opts.segmentsFile, parser, "Segment rules file is missing")
        _checkOpen(opts.segmentsFile, 'r')

    allowedMethods = [SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2]
    normalizedMethod = opts.serializationMethod.upper()
    if normalizedMethod not in allowedMethods:
        # sys.stderr.write / sys.exit work on both Python 2 and 3.
        sys.stderr.write('--serialization-method must be one of ('+str(allowedMethods)+')\n')
        parser.print_help()
        sys.exit(1)
    # The check above is case-insensitive, so hand back the canonical
    # upper-case name; otherwise a value such as "v1" would pass validation
    # and then fail when used as a key for the serializer lookup.
    opts.serializationMethod = normalizedMethod

    return opts
Michał Lenart authored
151
def _concatFiles(inputFiles):
    """
    Generator yielding the lines of all given files, one file after another.

    Falsy entries of inputFiles (such as empty strings) are skipped.
    Each file is opened in mode 'r' only once iteration reaches it.
    """
    for path in inputFiles:
        if not path:
            continue
        with open(path, 'r') as handle:
            for textLine in handle:
                yield textLine

def _readPolimorfInput4Analyzer(inputFiles, tagset, encoder, segmentRulesManager, trimSupneg):
    """
    Generator yielding analyzer entries converted from the input files.

    The concatenated lines of inputFiles are passed through
    convertinput.PolimorfConverter4Analyzer (created with encoding 'utf8')
    and every converted entry is yielded unchanged.
    """
    logging.info('reading analyzer data from %s', str(inputFiles))
    converter = convertinput.PolimorfConverter4Analyzer(tagset, encoder, 'utf8', segmentRulesManager, trimSupneg)
    inputLines = _concatFiles(inputFiles)
    for convertedEntry in converter.convert(inputLines):
        yield convertedEntry
Michał Lenart authored
163
Michał Lenart authored
164
165
166
167
def _readPolimorfInput4Generator(inputFiles, tagset, encoder):
    """
    Generator yielding generator (synthesis) entries converted from the input files.

    The concatenated lines of inputFiles are passed through
    convertinput.PolimorfConverter4Generator (created with encoding 'utf8')
    and every converted entry is yielded unchanged.
    """
    logging.info('reading generator data from %s', str(inputFiles))
    converter = convertinput.PolimorfConverter4Generator(tagset, encoder, 'utf8')
    inputLines = _concatFiles(inputFiles)
    for convertedEntry in converter.convert(inputLines):
        yield convertedEntry
Michał Lenart authored
168
Michał Lenart authored
169
def _readTrainData(trainFile):
    """
    Generator yielding the lines of trainFile, decoded as UTF-8 and with
    leading and trailing whitespace stripped.
    """
    stream = codecs.open(trainFile, 'r', 'utf8')
    try:
        for rawLine in stream:
            yield rawLine.strip()
    finally:
        # close the file when the generator finishes or is discarded
        stream.close()
Michał Lenart authored
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
def _printStats(fsa):
    """
    Logs basic statistics of the automaton at INFO level.

    Walks the states returned by fsa.dfs() once and counts accepting
    states, states with transitionsNum == 0 ("sink" states) and states
    flagged with serializeAsArray, then logs these counts together with
    the totals reported by fsa.getStatesNum() and fsa.getTransitionsNum().
    """
    accepting = sinks = arrays = 0
    for state in fsa.dfs():
        accepting += 1 if state.isAccepting() else 0
        sinks += 1 if state.transitionsNum == 0 else 0
        arrays += 1 if state.serializeAsArray else 0

    logging.info('states num: ' + str(fsa.getStatesNum()))
    logging.info('transitions num: ' + str(fsa.getTransitionsNum()))
    logging.info('accepting states num: ' + str(accepting))
    logging.info('sink states num: ' + str(sinks))
    logging.info('array states num: ' + str(arrays))
Michał Lenart authored
191
def buildAnalyzerFromPoliMorf(inputFiles, tagset, segmentRulesManager, trimSupneg):
    """
    Builds the analyzer FSA from the given input files.

    Creates an FSA with an encode.MorphEncoder, adds every (word, data)
    entry produced by _readPolimorfInput4Analyzer, closes the automaton,
    logs its statistics and returns it.
    """
    morphEncoder = encode.MorphEncoder()
    analyzerFsa = FSA(morphEncoder, tagset)

    entries = _readPolimorfInput4Analyzer(inputFiles, tagset, morphEncoder, segmentRulesManager, trimSupneg)
    for entryWord, entryData in entries:
        analyzerFsa.addEntry(entryWord, entryData)
    analyzerFsa.close()

    for headerLine in ('------', 'Analyzer FSA stats:', '------'):
        logging.info(headerLine)
    _printStats(analyzerFsa)
    return analyzerFsa
Michał Lenart authored
204
def buildGeneratorFromPoliMorf(inputFiles, tagset):
    """
    Builds the generator (synthesis) FSA from the given input files.

    Creates an FSA with an encode.Encoder4Generator, adds every
    (word, data) entry produced by _readPolimorfInput4Generator, closes
    the automaton, logs its statistics and returns it.
    """
    generatorEncoder = encode.Encoder4Generator()
    generatorFsa = FSA(generatorEncoder, tagset)

    for entryWord, entryData in _readPolimorfInput4Generator(inputFiles, tagset, generatorEncoder):
        generatorFsa.addEntry(entryWord, entryData)
    generatorFsa.close()

    for headerLine in ('------', 'Generator FSA stats:', '------'):
        logging.info(headerLine)
    _printStats(generatorFsa)
    return generatorFsa
Michał Lenart authored
216
Michał Lenart authored
217
def main(opts):
    """
    Builds the FSA selected by opts and writes it to opts.outputFile.

    opts -- option values as returned by _parseOptions()

    Steps: configure logging (DEBUG when opts.debug is set, INFO
    otherwise), read the tagset, build either the analyzer FSA (together
    with the serialized segment rules as additional data) or the generator
    FSA (with empty additional data), optionally train the FSA on
    opts.trainFile, then serialize it with the serializer chosen by
    opts.serializationMethod - to a C++ file when opts.cpp is set, to a
    binary file otherwise.
    """
    logging.basicConfig(level=logging.DEBUG if opts.debug else logging.INFO)

    if opts.analyzer:
        logging.info('*** building analyzer ***')
    else:
        logging.info('*** building generator ***')

    logging.info('reading tagset from %s', opts.tagsetFile)
    tagset = Tagset(opts.tagsetFile)

    if opts.analyzer:
        segmentRulesManager = rulesParser.RulesParser(tagset).parse(opts.segmentsFile)
        additionalData = segmentRulesManager.serialize()
        fsa = buildAnalyzerFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager, opts.trimSupneg)
    else:
        fsa = buildGeneratorFromPoliMorf(opts.inputFiles, tagset)
        additionalData = bytearray()

    if opts.trainFile:
        logging.info('training with '+opts.trainFile+' ...')
        fsa.train(_readTrainData(opts.trainFile))
        logging.info('done training')

    serializerClasses = {
        SerializationMethod.SIMPLE: SimpleSerializer,
        SerializationMethod.V1: VLengthSerializer1,
        SerializationMethod.V2: VLengthSerializer2,
    }
    # The command line validation accepts the method name in any letter
    # case, so normalise it before the lookup; a raw "v1" would otherwise
    # raise KeyError here.
    serializer = serializerClasses[opts.serializationMethod.upper()](fsa)

    if opts.cpp:
        serializer.serialize2CppFile(opts.outputFile, generator=opts.generator, additionalData=additionalData)
    else:
        serializer.serialize2BinaryFile(opts.outputFile, additionalData=additionalData)

    logging.info('total FSA size (in bytes): '+str(fsa.initialState.reverseOffset))
Michał Lenart authored
260
Michał Lenart authored
261
if __name__ == '__main__':
    # Script entry point: parse and validate the command line, then build
    # and serialize the requested FSA.
    opts = _parseOptions()
    main(opts)
Michał Lenart authored
265