'''
Created on Oct 23, 2013

@author: mlenart
'''
import logging
Michał Lenart authored
8
import itertools
Michał Lenart authored
9
from morfeuszbuilder.utils.serializationUtils import *
Michał Lenart authored
10
Michał Lenart authored
11
12
13
14
15
16
class Encoder(object):
    '''
    Base class that serializes words and interpretation data into bytearrays.

    Subclasses pick the flavour of the produced data by implementing
    encodeData; the shared implementation lives in _doEncodeData.
    '''

    # Marker bytes written as the first byte of an encoded case pattern
    # (see _encodeCasePattern).
    LEMMA_ONLY_LOWER = 0
    LEMMA_UPPER_PREFIX = 1
    LEMMA_MIXED_CASE = 2

    # Text type accepted by encodeWord: unicode on Python 2, str on Python 3.
    try:
        _TEXT_TYPE = unicode
    except NameError:
        _TEXT_TYPE = str

    def __init__(self, lowercase, encoding='utf8'):
        '''
        Constructor

        lowercase -- when true, encodeWord and word2SortKey lowercase
                     their input before encoding it
        encoding  -- name of the codec used to turn text into bytes
        '''
        self.lowercase = lowercase
        self.encoding = encoding
        # Maps a frozenset of qualifiers to its number; the empty set is 0.
        self.qualifiersMap = {frozenset(): 0}

    def encodeWord(self, word, lowercase=True):
        '''
        Return word as a bytearray in self.encoding.

        The word is lowercased first only when both self.lowercase and the
        lowercase argument are true.
        Raises AssertionError when word is not a text (unicode) string.
        '''
        assert isinstance(word, self._TEXT_TYPE)
        if self.lowercase and lowercase:
            word = word.lower()
        return bytearray(word, self.encoding)

    def encodeData(self, data):
        '''
        Encode interpretation data; must be implemented by subclasses.
        '''
        raise NotImplementedError()

    def decodeData(self, rawData):
        '''
        Decode data produced by encodeData; not implemented in this class.
        '''
        # Raise (not return) the exception so that callers cannot mistake
        # the exception object for decoded data.
        raise NotImplementedError()

    def decodeWord(self, rawWord):
        '''
        Decode rawWord (bytes in self.encoding) back into text, dropping
        leading and trailing NUL bytes.
        '''
        return bytes(rawWord).strip(b'\x00').decode(self.encoding)

    def word2SortKey(self, word):
        '''
        Return the encoded bytes of word (lowercased when self.lowercase
        is true), for use as a sort key.
        '''
        normalizedWord = word.lower() if self.lowercase else word
        return normalizedWord.encode(self.encoding)

    def _encodeTypeNum(self, typenum):
        '''
        Encode typenum (0..255) as a single byte.
        '''
        assert 0 <= typenum < 256
        return bytearray([typenum])

    def _encodeCasePattern(self, casePattern):
        '''
        Encode a case pattern (a sequence of flags, one per character).

        Layout of the result:
          * no True in the pattern     -> [LEMMA_ONLY_LOWER]
          * true prefix, false rest    -> [LEMMA_UPPER_PREFIX, prefix length]
          * anything else              -> [LEMMA_MIXED_CASE, number of set
                                           flags, index of each set flag...]
        '''
        res = bytearray()
        if True not in casePattern:
            res.append(self.LEMMA_ONLY_LOWER)
        elif self._hasUpperPrefix(casePattern):
            res.append(self.LEMMA_UPPER_PREFIX)
            res.append(self._getUpperPrefixLength(casePattern))
        else:
            # Every index has to fit into a single byte.
            assert len(casePattern) < 256
            upperIndexes = [idx for idx, flag in enumerate(casePattern) if flag]
            res.append(self.LEMMA_MIXED_CASE)
            res.append(len(upperIndexes))
            res.extend(upperIndexes)
        return res

    def _encodeQualifiers(self, qualifiers):
        '''
        Return htons(n) for the number n assigned to this set of qualifiers.

        A set seen for the first time is registered in self.qualifiersMap
        under the next free number.
        Raises AssertionError when the number reaches 500.
        '''
        key = frozenset(qualifiers)
        # len() is evaluated before the insertion, so a new key gets the
        # next free number.
        n = self.qualifiersMap.setdefault(key, len(self.qualifiersMap))
        assert n < 500
        res = bytearray()
        res.extend(htons(n))
        return res

    def _hasUpperPrefix(self, casePattern):
        '''
        Tell whether casePattern is a run of true flags followed only by
        false flags (either part may be empty).
        '''
        prefixLength = 0
        for flag in casePattern:
            if not flag:
                break
            prefixLength += 1
        return not any(casePattern[prefixLength:])

    def _getUpperPrefixLength(self, casePattern):
        '''
        Return the number of leading true flags in casePattern.

        Raises AssertionError when the pattern is not of the
        "true prefix, false rest" form.
        '''
        assert self._hasUpperPrefix(casePattern)
        for idx, flag in enumerate(casePattern):
            if not flag:
                return idx
        return len(casePattern)

    def _encodeTagNum(self, tagnum):
        '''
        Encode tagnum (0..65535) as two bytes, most significant byte first.
        '''
        assert 0 <= tagnum < 65536
        return bytearray([(tagnum & 0xFF00) >> 8, tagnum & 0x00FF])

    def _encodeNameNum(self, namenum):
        '''
        Encode namenum (0..255) as a single byte.
        '''
        assert 0 <= namenum < 256
        return bytearray([namenum])

    def _groupInterpsByType(self, interpsList):
        '''
        Return a dict mapping each typenum to the list of interpretations
        having that typenum.
        '''
        res = {}
        for interp in interpsList:
            res.setdefault(interp.typenum, []).append(interp)
        return res

    def _getOrthCasePatterns(self, interpsList):
        '''
        Return the orthCasePattern of every interpretation, each as a list.

        As soon as one interpretation has no True in its orthCasePattern
        an empty list is returned instead.
        '''
        res = []
        for interp in interpsList:
            if True not in interp.orthCasePattern:
                return []
            res.append(list(interp.orthCasePattern))
        return res

    def _encodeInterps4Type(self, typenum, interpsList, isAnalyzer):
        '''
        Encode all interpretations sharing one typenum.

        Layout of the result: typenum byte, htons(length of the payload),
        payload. For the analyzer the payload starts with the number of
        orth case patterns followed by those patterns. Then, for every
        interpretation (sorted by getSortKey()):
          * analyzer:  orth case pattern
            generator: homonymId and prefixToAdd (serializeString)
          * cutLength byte, suffixToAdd (serializeString)
          * analyzer only: case pattern of the encoded form
          * htons(tagnum), namenum byte, encoded qualifiers
        '''
        res = bytearray()
        res.extend(self._encodeTypeNum(typenum))
        encodedInterpsList = bytearray()
        if isAnalyzer:
            casePatterns = self._getOrthCasePatterns(interpsList)
            encodedInterpsList.append(len(casePatterns))
            for casePattern in casePatterns:
                encodedInterpsList.extend(self._encodeCasePattern(casePattern))
        for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
            if isAnalyzer:
                encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern))
            else:
                encodedInterpsList.extend(serializeString(interp.homonymId))
                encodedInterpsList.extend(serializeString(interp.encodedForm.prefixToAdd))
            encodedInterpsList.append(interp.encodedForm.cutLength)
            encodedInterpsList.extend(serializeString(interp.encodedForm.suffixToAdd))
            if isAnalyzer:
                encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern))
            encodedInterpsList.extend(htons(interp.tagnum))
            encodedInterpsList.append(interp.namenum)
            encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers))

        res.extend(htons(len(encodedInterpsList)))
        res.extend(encodedInterpsList)
        return res

    def _doEncodeData(self, interpsList, isAnalyzer):
        '''
        Encode a frozenset of interpretations.

        The result starts with one byte holding the number of distinct
        typenums (1..255), followed by the output of _encodeInterps4Type
        for each typenum group.
        Raises AssertionError when interpsList is not a frozenset, is
        empty, or has 256 or more distinct typenums.
        '''
        assert isinstance(interpsList, frozenset)

        segnum2Interps = self._groupInterpsByType(interpsList)

        res = bytearray()
        firstByte = len(segnum2Interps)
        assert firstByte < 256
        assert firstByte > 0
        res.append(firstByte)

        # Use a separate loop variable so the parameter is not shadowed.
        for typenum, typeInterps in segnum2Interps.items():
            res.extend(self._encodeInterps4Type(typenum, typeInterps, isAnalyzer))

        return res
Michał Lenart authored
162
163
164
165

class MorphEncoder(Encoder):
    '''
    Encoder that lowercases words and encodes interpretation data with
    isAnalyzer=True.
    '''

    def __init__(self, encoding='utf8'):
        '''
        Constructor

        encoding -- name of the codec used to turn text into bytes
        '''
        super(MorphEncoder, self).__init__(True, encoding)
        # Marker bytes that _encodeCasePattern writes as the first byte
        # of an encoded case pattern.
        self.LEMMA_ONLY_LOWER = 0
        self.LEMMA_UPPER_PREFIX = 1
        self.LEMMA_MIXED_CASE = 2

    def encodeData(self, interpsList):
        '''
        Encode a frozenset of interpretations in the analyzer layout.
        '''
        return self._doEncodeData(interpsList, isAnalyzer=True)
Michał Lenart authored
173
174
175
176

class Encoder4Generator(Encoder):
    '''
    Encoder that keeps the case of words unchanged and encodes
    interpretation data with isAnalyzer=False.
    '''

    def __init__(self, encoding='utf8'):
        '''
        Constructor

        encoding -- name of the codec used to turn text into bytes
        '''
        super(Encoder4Generator, self).__init__(False, encoding)

    def encodeData(self, interpsList):
        '''
        Encode a frozenset of interpretations in the generator layout.
        '''
        return self._doEncodeData(interpsList, isAnalyzer=False)