Blame view

fsabuilder/morfeuszbuilder/segrules/rulesManager.py 2.75 KB
Michał Lenart authored
1
2
3
4
5
'''
Created on 20 lut 2014

@author: mlenart
'''
Michał Lenart authored
6
import logging
Michał Lenart authored
7
from morfeuszbuilder.utils.serializationUtils import htons, htonl
Michał Lenart authored
8
from morfeuszbuilder.utils import serializationUtils
Michał Lenart authored
9
Michał Lenart authored
10
11
class RulesManager(object):
    '''Holds segmentation-rule DFAs keyed by option sets and serializes them.

    Each DFA is registered under a mapping of option names to values (the
    serializer expects the options 'aggl' and 'praet'). The mapping is
    turned into a hashable frozenset of its items so it can be used as a
    dictionary key.
    '''

    def __init__(self, segtypes, separatorsList):
        '''Create an empty manager.

        :param segtypes: object providing ``lexeme2Segnum(lemma, tagnum)``;
            used by :meth:`lexeme2SegmentTypeNum`.
        :param separatorsList: iterable of separator values; they are
            sorted and written out by :meth:`serialize` (presumably integer
            code points - confirm against the caller).
        '''
        # frozenset(options.items()) -> DFA registered for those options
        self.options2DFA = {}
        self.segtypes = segtypes
        self.separatorsList = separatorsList
        # Must be set through setDefaultOptions() before serialize() is used.
        self.defaultOptions = None

    def _options2Key(self, optionsMap):
        '''Return a hashable key (frozenset of items) for *optionsMap*.'''
        return frozenset(optionsMap.items())

    def _key2Options(self, optionsKey):
        '''Rebuild the options dict from a key made by _options2Key().'''
        return dict(optionsKey)

    def getDFA(self, optionsMap):
        '''Return the DFA registered for *optionsMap*.

        :raises KeyError: when no DFA was added for these options.
        '''
        return self.options2DFA[self._options2Key(optionsMap)]

    def setDefaultOptions(self, key2Def):
        '''Store *key2Def* as the default options map written by serialize().'''
        self.defaultOptions = key2Def

    def addDFA(self, optionsMap, dfa):
        '''Register *dfa* for *optionsMap*, replacing any previous entry.'''
        self.options2DFA[self._options2Key(optionsMap)] = dfa

    def lexeme2SegmentTypeNum(self, lemma, tagnum):
        '''Return the segment type number for the given lemma and tag number.

        Delegates to ``self.segtypes.lexeme2Segnum``.

        :raises ValueError: when the lookup yields ``None``.
        '''
        res = self.segtypes.lexeme2Segnum(lemma, tagnum)
        if res is None:
            raise ValueError(
                'no segment type found for lemma %r with tag number %r'
                % (lemma, tagnum))
        return res

    def serialize(self):
        '''Serialize the separators, every registered DFA and the defaults.

        Layout of the result, in order:

        1. the separators list (see _serializeSeparatorsList),
        2. one byte holding the number of DFAs,
        3. for every DFA: its options map followed by the DFA itself,
        4. the default options map.

        :returns: ``bytearray`` with the serialized rules.
        :raises AssertionError: when there are no DFAs or more than 255.
        '''
        res = bytearray()
        res.extend(self._serializeSeparatorsList())

        dfasNum = len(self.options2DFA)
        # The count is stored in a single byte, hence the upper bound.
        assert 0 < dfasNum < 256, \
            'number of DFAs must be in range 1..255, got %d' % dfasNum
        res.append(dfasNum)

        # items() instead of the Python-2-only iteritems(): same pairs,
        # works on both Python 2 and Python 3.
        for key, dfa in self.options2DFA.items():
            optionsMap = self._key2Options(key)
            res.extend(self._serializeOptionsMap(optionsMap))
            res.extend(self._serializeDFA(dfa))

        res.extend(self._serializeOptionsMap(self.defaultOptions))
        logging.info('segmentation rules size: %s bytes', len(res))
        return res

    def _serializeSeparatorsList(self):
        '''Serialize the separators: their count, then each one in sorted order.

        The count is encoded with ``serializationUtils.htons`` and each
        separator with ``serializationUtils.htonl`` (presumably 16-bit and
        32-bit big-endian values - confirm in serializationUtils).
        '''
        res = bytearray()
        res.extend(serializationUtils.htons(len(self.separatorsList)))
        for cp in sorted(self.separatorsList):
            res.extend(serializationUtils.htonl(cp))
        return res

    def _serializeOptionsMap(self, optionsMap):
        '''Serialize the 'aggl' and 'praet' options of *optionsMap*.

        Output: one byte with the number of options written, then for each
        option its name and its value as NUL-terminated UTF-8 strings.
        Only these two options are written; other keys are ignored.

        :raises KeyError: when 'aggl' or 'praet' is missing.
        '''
        optionNames = ('aggl', 'praet')
        assert len(optionsMap) < 256, \
            'options map too large: %d entries' % len(optionsMap)
        res = bytearray()
        res.append(len(optionNames))
        for name in optionNames:
            res.extend(self._serializeString(name))
            res.extend(self._serializeString(optionsMap[name]))
        return res

    def _serializeDFA(self, dfa):
        '''Serialize *dfa* as its byte length (via htonl) followed by its bytes.

        The bytes come from ``dfa.serialize()``.
        '''
        res = bytearray()
        dfaBytearray = dfa.serialize()
        res.extend(htonl(len(dfaBytearray)))
        res.extend(dfaBytearray)
        return res

    def _serializeString(self, string):
        '''Return *string* encoded as UTF-8 followed by a terminating zero byte.

        No length prefix is written; the zero byte marks the end.
        '''
        res = bytearray()
        res.extend(string.encode('utf8'))
        res.append(0)
        return res