rulesManager.py
2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
'''
Created on 20 Feb 2014
@author: mlenart
'''
import logging
from morfeuszbuilder.fsa.serializer import SimpleSerializer
class RulesManager(object):
    '''
    Keeps one DFA per combination of option values and serializes them all
    into a single byte sequence.

    DFAs are stored in ``options2DFA`` under a hashable key built from the
    options map (see ``_options2Key``).
    '''

    def __init__(self, segtypes):
        '''
        @param segtypes: object exposing ``lexeme2Segnum(lemma, tagnum)``;
            used only by ``lexeme2SegmentTypeNum``.
        '''
        self.options2DFA = {}
        self.segtypes = segtypes

    def _options2Key(self, optionsMap):
        '''Turn an options dict into a hashable, order-independent key.'''
        return frozenset(optionsMap.items())

    def _key2Options(self, optionsKey):
        '''Inverse of ``_options2Key``: rebuild the options dict from a key.'''
        return dict(optionsKey)

    def getDFA(self, optionsMap):
        '''
        Return the DFA registered for the given options.

        @raise KeyError: if no DFA was added for these options.
        '''
        return self.options2DFA[self._options2Key(optionsMap)]

    def addDFA(self, optionsMap, dfa):
        '''Register (or replace) the DFA for the given options.'''
        self.options2DFA[self._options2Key(optionsMap)] = dfa

    def lexeme2SegmentTypeNum(self, lemma, tagnum):
        '''
        Look up the segment type number of a lexeme via ``segtypes``.

        @raise ValueError: if ``segtypes.lexeme2Segnum`` returns None.
        '''
        res = self.segtypes.lexeme2Segnum(lemma, tagnum)
        if res is None:
            raise ValueError(
                'no segment type for lemma %r with tag number %r' % (lemma, tagnum))
        return res

    def serialize(self):
        '''
        Serialize all registered DFAs into a bytearray.

        Layout: one byte with the number of DFAs, then for every DFA its
        serialized options map followed by the serialized DFA itself.

        @return: bytearray with the serialized rules.
        '''
        res = bytearray()
        dfasNum = len(self.options2DFA)
        # The count is written as a single byte, so it must fit in 1..255.
        assert 0 < dfasNum < 256, 'expected 1..255 DFAs, got %d' % dfasNum
        res.append(dfasNum)
        # items() instead of iteritems(): works on both Python 2 and Python 3.
        for key, dfa in self.options2DFA.items():
            optionsMap = self._key2Options(key)
            res.extend(self._serializeOptionsMap(optionsMap))
            res.extend(self._serializeDFA(dfa))
        logging.info('segmentation rules size: %s bytes', len(res))
        return res

    def _serializeOptionsMap(self, optionsMap):
        '''
        Serialize the 'aggl' and 'praet' option values, in that order, each as
        a NUL-terminated UTF-8 string.

        @raise KeyError: if either option is missing from the map.
        '''
        assert len(optionsMap) < 256, 'too many options: %d' % len(optionsMap)
        res = bytearray()
        res.extend(self._serializeString(optionsMap['aggl']))
        res.extend(self._serializeString(optionsMap['praet']))
        return res

    def _serializeDFA(self, dfa):
        '''
        Serialize a DFA as a length prefix followed by the DFA bytes.

        The prefix is whatever ``SimpleSerializer.htonl`` produces for the
        byte length (presumably a 4-byte network-order integer - confirm
        against SimpleSerializer).
        '''
        res = bytearray()
        serializer = SimpleSerializer(dfa, serializeTransitionsData=True)
        dfaBytearray = serializer.fsa2bytearray()
        res.extend(serializer.htonl(len(dfaBytearray)))
        res.extend(dfaBytearray)
        return res

    def _serializeString(self, string):
        '''Encode a string as UTF-8 followed by a terminating NUL byte.'''
        res = bytearray()
        res.extend(string.encode('utf8'))
        res.append(0)
        return res