buildfsa.py
3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding:utf-8 -*-
'''
Created on 21 paź 2013
@author: mlenart
'''
import sys
import logging
import codecs
import encode
import convertinput
from fsa import FSA
from serializer import SimpleSerializerWithStringValues
from visualizer import Visualizer
from optparse import OptionParser
# Configure the root logger to emit DEBUG-level (and above) messages.
# This is a module-level statement, so it runs whenever the module is loaded.
logging.basicConfig(level=logging.DEBUG)
class OutputFormat:
    """
    Names of the output formats accepted by the --output-format option.
    """
    CPP = 'CPP'
    BINARY = 'BINARY'
class InputFormat:
    """
    Names of the input formats accepted by the --input-format option.
    """
    PLAIN = 'PLAIN'
    POLIMORF = 'POLIMORF'
    ENCODED = 'ENCODED'
def parseOptions():
    """
    Parses commandline args.

    The options --input-file, --output-file, --input-format and
    --output-format are all required. Format names are matched
    case-insensitively and are stored upper-cased on the returned object, so
    they can be compared directly with the InputFormat / OutputFormat
    constants.

    Returns the optparse options object with the attributes inputFile,
    outputFile, inputFormat, outputFormat and visualize.

    Prints a message and exits with status 1 when a required option is
    missing or a format name is not recognized.
    """
    outputFormats = [OutputFormat.BINARY, OutputFormat.CPP]
    inputFormats = [InputFormat.ENCODED, InputFormat.POLIMORF, InputFormat.PLAIN]

    parser = OptionParser()
    parser.add_option('-i', '--input-file',
                      dest='inputFile',
                      metavar='FILE',
                      help='path to input file')
    parser.add_option('-o', '--output-file',
                      dest='outputFile',
                      metavar='FILE',
                      help='path to output file')
    parser.add_option('--input-format',
                      dest='inputFormat',
                      help='input format - ENCODED, POLIMORF or PLAIN')
    parser.add_option('--output-format',
                      dest='outputFormat',
                      help='output format - BINARY or CPP')
    parser.add_option('--visualize',
                      dest='visualize',
                      action='store_true',
                      default=False,
                      help='visualize result')

    # Positional arguments are not used by this script.
    opts, _ = parser.parse_args()

    required = [opts.inputFile, opts.outputFile, opts.outputFormat, opts.inputFormat]
    if any(value is None for value in required):
        parser.print_help()
        sys.exit(1)

    # Normalize once here: validation is case-insensitive, so the values handed
    # back to the caller must be in the same canonical (upper-case) form that
    # the format constants use.
    opts.outputFormat = opts.outputFormat.upper()
    opts.inputFormat = opts.inputFormat.upper()

    if opts.outputFormat not in outputFormats:
        sys.stderr.write('output format must be one of (' + str(outputFormats) + ')\n')
        sys.exit(1)
    if opts.inputFormat not in inputFormats:
        sys.stderr.write('input format must be one of (' + str(inputFormats) + ')\n')
        sys.exit(1)
    return opts
def readEncodedInput(inputFile):
    """
    Reads already-encoded input from a UTF-8 file.

    Each non-blank line must consist of exactly two whitespace-separated
    fields: a word and its interpretations joined with '|'.

    Yields (word, list of interpretation strings) tuples in file order.
    Blank lines are skipped. A non-blank line with a different number of
    fields raises ValueError (from the tuple unpacking).
    """
    with codecs.open(inputFile, 'r', 'utf8') as f:
        # Iterate the file object directly instead of readlines(), so the
        # whole file does not have to be held in memory at once.
        for line in f:
            fields = line.split()
            if not fields:
                # Blank / whitespace-only line: nothing to yield.
                continue
            word, interps = fields
            yield word, interps.split(u'|')
def readPolimorfInput(inputFile, encoder):
    """
    Reads the UTF-8 file inputFile and yields, one by one, the entries that
    convertinput.convertPolimorf produces for its lines.

    convertPolimorf is given a key function that takes a (word, interp) pair
    and returns encoder.word2SortKey(word).
    """
    def sortKeyOfEntry(entry):
        # The key depends on the word only; the interpretation is ignored.
        word, _ = entry
        return encoder.word2SortKey(word)

    with codecs.open(inputFile, 'r', 'utf8') as f:
        allLines = f.readlines()
        for converted in convertinput.convertPolimorf(allLines, sortKeyOfEntry):
            yield converted
def readPlainInput(inputFile, encoder):
    """
    Reads a plain word list (one word per line) from a UTF-8 file.

    inputFile -- path of the file to read
    encoder   -- object providing word2SortKey(word), used as the sort key

    Yields (word, '') tuples ordered by encoder.word2SortKey(word).
    Surrounding whitespace is removed from every word and blank lines are
    skipped.
    """
    with codecs.open(inputFile, 'r', 'utf8') as f:
        # Strip BEFORE sorting, so the sort key is computed from the word
        # itself and not from the raw line with its trailing newline.
        words = [line.strip() for line in f]
    for word in sorted((w for w in words if w), key=encoder.word2SortKey):
        yield word, ''
if __name__ == '__main__':
    # Script entry point: read the input in the requested format, feed it to
    # the FSA, serialize the automaton and optionally visualize it.
    opts = parseOptions()
    encoder = encode.Encoder()
    fsa = FSA(encoder)
    serializer = SimpleSerializerWithStringValues()

    # parseOptions() accepts format names case-insensitively, so bring them to
    # the canonical upper-case form before comparing with the constants.
    inputFormat = opts.inputFormat.upper()
    outputFormat = opts.outputFormat.upper()

    # Map each input format to a callable, so that only the reader that is
    # actually needed gets created.
    readers = {
        InputFormat.ENCODED: lambda: readEncodedInput(opts.inputFile),
        InputFormat.POLIMORF: lambda: readPolimorfInput(opts.inputFile, encoder),
        InputFormat.PLAIN: lambda: readPlainInput(opts.inputFile, encoder),
    }
    inputData = readers[inputFormat]()

    logging.info('feeding FSA with data ...')
    fsa.feed(inputData)
    logging.info('states num: ' + str(fsa.getStatesNum()))

    if outputFormat == OutputFormat.CPP:
        serializer.serialize2CppFile(fsa, opts.outputFile)
    else:
        serializer.serialize2BinaryFile(fsa, opts.outputFile)

    if opts.visualize:
        Visualizer().visualize(fsa)