|
1
2
3
4
5
|
'''
Created on 20 Feb 2014
@author: mlenart
'''
|
|
6
|
import logging
|
|
7
|
from morfeuszbuilder.utils.serializationUtils import htons, htonl
|
|
8
|
from morfeuszbuilder.utils import serializationUtils
|
|
9
|
|
|
10
11
|
class RulesManager(object):
    '''Registry of segmentation-rule DFAs keyed by option sets, plus their
    binary serialization.

    Each DFA is stored under a hashable key derived from an options mapping
    (see ``_options2Key``).  ``serialize`` concatenates, in order: the
    separators list, a one-byte DFA count, an (options map, DFA) record for
    every registered DFA, and finally the default options map.
    '''

    def __init__(self, segtypes, separatorsList):
        '''
        :param segtypes: object providing ``lexeme2Segnum(lemma, tagnum)``;
            used by ``lexeme2SegmentTypeNum``.
        :param separatorsList: iterable of sortable values, each passed to
            ``serializationUtils.htonl`` when serializing (presumably integer
            code points - confirm against the caller).
        '''
        self.options2DFA = {}
        self.segtypes = segtypes
        self.separatorsList = separatorsList
        # Must be set with setDefaultOptions() before serialize() is called.
        self.defaultOptions = None

    def _options2Key(self, optionsMap):
        '''Return a hashable, order-independent key for an options mapping.'''
        return frozenset(optionsMap.items())

    def _key2Options(self, optionsKey):
        '''Rebuild the options dict from a key made by ``_options2Key``.'''
        return dict(optionsKey)

    def getDFA(self, optionsMap):
        '''Return the DFA registered for ``optionsMap``.

        :raises KeyError: if no DFA was added for these options.
        '''
        return self.options2DFA[self._options2Key(optionsMap)]

    def setDefaultOptions(self, key2Def):
        '''Store the options mapping that ``serialize`` writes last.'''
        self.defaultOptions = key2Def

    def addDFA(self, optionsMap, dfa):
        '''Register ``dfa`` under ``optionsMap``, replacing any DFA already
        stored for an equal options mapping.'''
        self.options2DFA[self._options2Key(optionsMap)] = dfa

    def lexeme2SegmentTypeNum(self, lemma, tagnum):
        '''Return the segment type number ``segtypes`` reports for a lexeme.

        :raises ValueError: if ``segtypes.lexeme2Segnum`` returns ``None``.
        '''
        res = self.segtypes.lexeme2Segnum(lemma, tagnum)
        if res is None:
            raise ValueError(
                'no segment type for lemma %r, tagnum %r' % (lemma, tagnum))
        return res

    def serialize(self):
        '''Serialize the separators, all registered DFAs and the default
        options into a single ``bytearray``.

        The number of DFAs is written as one byte, so between 1 and 255 DFAs
        must have been registered.  ``setDefaultOptions`` must have been
        called beforehand, because the default options map is serialized too.
        '''
        res = bytearray()
        res.extend(self._serializeSeparatorsList())

        dfasNum = len(self.options2DFA)
        assert 0 < dfasNum < 256
        res.append(dfasNum)
        # items() rather than the Python-2-only iteritems(), so that this
        # runs on both Python 2 and Python 3.
        for key, dfa in self.options2DFA.items():
            optionsMap = self._key2Options(key)
            res.extend(self._serializeOptionsMap(optionsMap))
            res.extend(self._serializeDFA(dfa))

        res.extend(self._serializeOptionsMap(self.defaultOptions))
        logging.info('segmentation rules size: %s bytes', len(res))
        return res

    def _serializeSeparatorsList(self):
        '''Serialize the separators: ``htons`` of the list length followed by
        ``htonl`` of every separator in ascending order.'''
        res = bytearray()
        res.extend(serializationUtils.htons(len(self.separatorsList)))
        for cp in sorted(self.separatorsList):
            res.extend(serializationUtils.htonl(cp))
        return res

    def _serializeOptionsMap(self, optionsMap):
        '''Serialize the ``'aggl'`` and ``'praet'`` entries of ``optionsMap``.

        The output is the byte 2 (the number of entries written) followed by
        the key and value of each entry as NUL-terminated strings, always in
        the order ``aggl``, ``praet``.

        :raises KeyError: if either option is missing from ``optionsMap``.
        '''
        assert len(optionsMap) < 256
        res = bytearray()
        res.append(2)
        for optionName in ('aggl', 'praet'):
            res.extend(self._serializeString(optionName))
            res.extend(self._serializeString(optionsMap[optionName]))
        return res

    def _serializeDFA(self, dfa):
        '''Serialize ``dfa`` as ``htonl`` of its byte length followed by the
        bytes returned by ``dfa.serialize()``.'''
        res = bytearray()
        dfaBytearray = dfa.serialize()
        res.extend(htonl(len(dfaBytearray)))
        res.extend(dfaBytearray)
        return res

    def _serializeString(self, string):
        '''Return ``string`` encoded as UTF-8 with a terminating NUL byte
        (no length prefix is written).'''
        res = bytearray()
        res.extend(string.encode('utf8'))
        res.append(0)
        return res
|