# fsabuilder/morfeuszbuilder/fsa/encode.py
'''
Created on Oct 23, 2013

@author: mlenart
'''
Michał Lenart authored
7
import logging
Michał Lenart authored
8
from morfeuszbuilder.utils import serializationUtils
Michał Lenart authored
9
Michał Lenart authored
10
11
12
13
14
15
class Encoder(object):
    '''
    Base class that turns words and interpretation data into byte
    sequences.

    Subclasses provide encodeData(); the _encode* helpers below build the
    individual byte fields and _doEncodeData() assembles them.

    _encodeCasePattern() reads self.LEMMA_ONLY_LOWER,
    self.LEMMA_UPPER_PREFIX and self.LEMMA_MIXED_CASE; this class does not
    define them, so an instance that encodes case patterns must provide
    them.
    '''

    def __init__(self, encoding='utf8'):
        '''
        Store the character encoding used by every word conversion.
        '''
        self.encoding = encoding

    def encodeWord(self, word, lowercase=True):
        '''
        Return word as a bytearray in self.encoding.

        The word is lowercased first unless lowercase is False.
        word must be a unicode object.
        '''
        assert isinstance(word, unicode)
        text = word.lower() if lowercase else word
        return bytearray(text, self.encoding)

    def encodeData(self, data):
        '''
        Encode interpretation data; must be overridden by subclasses.
        '''
        raise NotImplementedError()

    def decodeData(self, rawData):
        '''
        Decode interpretation data; not implemented in this class.
        '''
        # Raise (not return) the exception so callers cannot mistake an
        # exception instance for decoded data.
        raise NotImplementedError()

    def decodeWord(self, rawWord):
        '''
        Return rawWord decoded with self.encoding, after stripping leading
        and trailing NUL bytes.
        '''
        return unicode(str(rawWord).strip('\x00'), self.encoding)

    def word2SortKey(self, word):
        '''
        Return the lowercased word encoded with self.encoding.
        '''
        return word.lower().encode(self.encoding)

    def _encodeTypeNum(self, typenum):
        '''
        Return typenum as a single byte; typenum must be in 0..255.
        '''
        assert 0 <= typenum < 256
        return bytearray([typenum])

    def _encodeEncodedForm(self, form, withCasePattern, withPrefix=False):
        '''
        Serialize form into a bytearray.

        Layout, in order:
          - if withPrefix: form.prefixToAdd bytes followed by a 0 byte,
          - form.cutLength as one byte (must be in 0..255),
          - form.suffixToAdd bytes followed by a 0 byte,
          - if withCasePattern: the encoded form.casePattern.
        Prefix and suffix are encoded without lowercasing.
        '''
        assert 0 <= form.cutLength < 256
        res = bytearray()
        if withPrefix:
            res.extend(self.encodeWord(form.prefixToAdd, lowercase=False))
            res.append(0)
        res.append(form.cutLength)
        res.extend(self.encodeWord(form.suffixToAdd, lowercase=False))
        res.append(0)
        if withCasePattern:
            res.extend(self._encodeCasePattern(form.casePattern))
        return res

    def _encodeCasePattern(self, casePattern):
        '''
        Serialize casePattern (a sequence of flags, one per position).

        Output is one of:
          - [LEMMA_ONLY_LOWER] when the pattern contains no True,
          - [LEMMA_UPPER_PREFIX, prefix length] when all set flags form a
            leading run,
          - [LEMMA_MIXED_CASE, count, idx...] otherwise, listing the
            positions of the set flags (pattern length must be < 256).
        '''
        res = bytearray()
        if True not in casePattern:
            res.append(self.LEMMA_ONLY_LOWER)
        elif self._hasUpperPrefix(casePattern):
            res.append(self.LEMMA_UPPER_PREFIX)
            res.append(self._getUpperPrefixLength(casePattern))
        else:
            # Positions are stored as single bytes, hence the length limit.
            assert len(casePattern) < 256
            upperIdxs = [idx for idx, isUpper in enumerate(casePattern) if isUpper]
            res.append(self.LEMMA_MIXED_CASE)
            res.append(len(upperIdxs))
            res.extend(upperIdxs)
        return res

    def _hasUpperPrefix(self, casePattern):
        '''
        Return True when every truthy flag precedes every falsy flag,
        i.e. the pattern is a (possibly empty) run of set flags followed
        by a (possibly empty) run of unset flags.
        '''
        seenLower = False
        for isUpper in casePattern:
            if not isUpper:
                seenLower = True
            elif seenLower:
                # A set flag after an unset one breaks the prefix shape.
                return False
        return True

    def _getUpperPrefixLength(self, casePattern):
        '''
        Return the number of leading truthy flags in casePattern.

        casePattern must satisfy _hasUpperPrefix().
        '''
        assert self._hasUpperPrefix(casePattern)
        for idx, isUpper in enumerate(casePattern):
            if not isUpper:
                return idx
        return len(casePattern)

    def _encodeTagNum(self, tagnum):
        '''
        Return tagnum as two bytes, high byte first; tagnum must be in
        0..65535.
        '''
        assert 0 <= tagnum < 65536
        return bytearray([tagnum >> 8, tagnum & 0xFF])

    def _encodeNameNum(self, namenum):
        '''
        Return namenum as a single byte; namenum must be in 0..255.
        '''
        assert 0 <= namenum < 256
        return bytearray([namenum])

    def _groupInterpsByType(self, interpsList):
        '''
        Return a dict mapping each typenum to the list of interps from
        interpsList having that typenum, in iteration order.
        '''
        res = {}
        for interp in interpsList:
            res.setdefault(interp.typenum, []).append(interp)
        return res

    def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix):
        '''
        Serialize all interps of one type.

        Output is the typenum byte, then the payload length as produced
        by serializationUtils.htons (presumably a 2-byte value - confirm
        against that helper), then the payload. The payload holds, for
        each interp sorted by getSortKey(): its encoded form, its tag
        number (2 bytes) and its name number (1 byte).
        '''
        encodedInterpsList = bytearray()
        for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
            encodedInterpsList.extend(self._encodeEncodedForm(
                interp.encodedForm,
                withCasePattern=withCasePattern,
                withPrefix=withPrefix))
            encodedInterpsList.extend(self._encodeTagNum(interp.tagnum))
            encodedInterpsList.extend(self._encodeNameNum(interp.namenum))

        res = bytearray()
        res.extend(self._encodeTypeNum(typenum))
        res.extend(serializationUtils.htons(len(encodedInterpsList)))
        res.extend(encodedInterpsList)
        return res

    def _doEncodeData(self, interpsList, withCasePattern, withPrefix):
        '''
        Serialize a frozenset of interps.

        Output starts with one byte holding the number of distinct
        typenums (must be in 1..255), followed by one
        _encodeInterps4Type() record per typenum.
        '''
        assert isinstance(interpsList, frozenset)

        segnum2Interps = self._groupInterpsByType(interpsList)

        typesCount = len(segnum2Interps)
        assert typesCount < 256
        assert typesCount > 0

        res = bytearray()
        res.append(typesCount)
        # NOTE(review): records are emitted in dict iteration order, not
        # sorted by typenum - confirm readers do not rely on an ordering.
        for typenum, typeInterps in segnum2Interps.items():
            res.extend(self._encodeInterps4Type(typenum, typeInterps, withCasePattern, withPrefix))
        return res
Michał Lenart authored
139
140
141
142
143
144
145
146
147
148

class MorphEncoder(Encoder):
    '''
    Encoder whose encodeData() output includes a case pattern for every
    form and no prefix.
    '''

    def __init__(self, encoding='utf8'):
        '''
        Set the encoding and the marker bytes that _encodeCasePattern()
        reads from the instance.
        '''
        super(MorphEncoder, self).__init__(encoding)
        # First byte of an encoded case pattern, one value per layout.
        self.LEMMA_ONLY_LOWER = 0
        self.LEMMA_UPPER_PREFIX = 1
        self.LEMMA_MIXED_CASE = 2

    def encodeData(self, interpsList):
        '''
        Serialize interpsList (a frozenset of interps) with case patterns
        and without prefixes.
        '''
        return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False)
Michał Lenart authored
150
151
152
153

class Encoder4Generator(Encoder):
    '''
    Encoder whose encodeData() output includes a prefix for every form
    and no case pattern.
    '''

    def __init__(self, encoding='utf8'):
        '''
        Set the encoding; no further state is added.
        '''
        super(Encoder4Generator, self).__init__(encoding)

    def encodeData(self, interpsList):
        '''
        Serialize interpsList (a frozenset of interps) with prefixes and
        without case patterns.
        '''
        return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True)